From b8093942547b02ac99cb72fbb8a390d412a9160f Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 18 May 2020 11:23:06 +0800 Subject: [PATCH 01/98] update doc for PAI --- docs/en_US/TrainingService/PaiMode.md | 12 ++++++------ docs/img/pai_job_submission_page.jpg | Bin 127488 -> 39763 bytes docs/img/pai_profile.jpg | Bin 0 -> 18093 bytes docs/img/pai_token.jpg | Bin 0 -> 18383 bytes docs/img/pai_token_profile.jpg | Bin 55722 -> 0 bytes 5 files changed, 6 insertions(+), 6 deletions(-) create mode 100644 docs/img/pai_profile.jpg create mode 100644 docs/img/pai_token.jpg delete mode 100644 docs/img/pai_token_profile.jpg diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index c608cc970a..9d4e6aedc5 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -7,9 +7,9 @@ Step 1. Install NNI, follow the install guide [here](../Tutorial/QuickStart.md). Step 2. Get PAI token. Click `My profile` button in the top-right side of PAI's webprotal. -![](../../img/pai_token_button.jpg) -Find the token management region, copy one of the token as your account token. -![](../../img/pai_token_profile.jpg) +![](../../img/pai_profile.jpg) +Click `copy` button in the page to copy a jwt token. +![](../../img/pai_token.jpg) Step 3. Mount NFS storage to local machine. Click `Submit job` button in PAI's webportal. @@ -19,7 +19,7 @@ Step 3. Mount NFS storage to local machine. The `DEFAULT_STORAGE`field is the path to be mounted in PAI's container when a job is started. The `Preview container paths` is the NFS host and path that PAI provided, you need to mount the corresponding host and path to your local machine first, then NNI could use the PAI's NFS storage. 
For example, use the following command: ``` -sudo mount nfs://gcr-openpai-infra02:/pai/data /local/mnt +sudo mount -t nfs4 gcr-openpai-infra02:/pai/data /local/mnt ``` Then the `/data` folder in container will be mounted to `/local/mnt` folder in your local machine. You could use the following configuration in your NNI's config file: @@ -66,7 +66,7 @@ trial: virtualCluster: default nniManagerNFSMountPath: /home/user/mnt containerNFSMountPath: /mnt/data/user - paiStoragePlugin: team_wise + paiStoragePlugin: teamwise_storage # Configuration to access OpenPAI Cluster paiConfig: userName: your_pai_nni_user @@ -74,7 +74,7 @@ paiConfig: host: 10.1.1.1 ``` -Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. +Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. The host field in configuration file is PAI's job submission page uri, like `10.10.5.1`, the default http protocal in NNI is `http`, If your PAI's cluster is enabled https, please use the uri in `https://10.10.5.1` format. 
Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have these additional keys: * cpuNum diff --git a/docs/img/pai_job_submission_page.jpg b/docs/img/pai_job_submission_page.jpg index f49a1c267eea2a48b986e52c97180b06a524264d..377a66f593c937b1b264d9e838ddb69b96320dd4 100644 GIT binary patch literal 39763 zcmeFZcTkgQ*f)y0iY^GNAgF+>q9R?2w1mX9p&&#lBHcn21eB5hA<;!)0fAKzq$Cj) zA|Sm*TBM0mMVi!@ARsja2!xPwp5VG0-2GXYJ zc7j%}UnBgYGg`&Z?c?t^H(|o&4oO$PJRbF7tDleo)t1^%?^`u5Az4-ggiKZ~S3q!$ z*H`R5T9!-=AC72u zt;T>I_*bGWZykldtdA*Y50GWhv@3|l649{E+;TM*OEyF^u)k#9cYJ6siWa*RMk10q zb9hpaB5O%<_8iu9{V;uz_+H0+9hMr-MiGNzmxnVWMmvcW%<4HIS$oHo#qtHO?Zn&U z8L}LDtt=Q@i9ffrc!z#809-ZcPd@emzH6Vss z^du+;_D8>yAxFW4bB)_I4Py?PYQ7^8b#=QBkhpxN7O0s>QAjYgRjtyjWDAKNY)&tk zkC9mpcdYiwL`J4}8geFU=2lJ`)|GJQ>YTKT5m|v_cLhC0JHe57Ir5}j`~60=p9Xkw zIbww9rv)~*e=8>%4v*ePbXyq{3bhrh9f3zn1g8Aij)a0C1W1UQ$&s3M9^v7l+R#bt zXSU8Q@unp%)i@PsnG7$zmLt<8k#ekI^lJFrN-!mtj@r$k$Zo;i&f=iwNP$s!t6B~w zv`*DvH)OFF7T%aWGpQUCR6?PwubG3S$d&V zrKz%#wF{!)qq=sq;#qOaVQG=zSQYye*V*8s)6ANWXj$}PN4^R{|u4JvI+9A~`Cg=u!%6Vm} z;nCFbPpB`WK9$_-#;2Q{MK8%tO|n-%3f7vCf}MSk?XeERIwAfOEQrrk$J~XP;AAX% zaw0z@c;>cQrv1{(v{kQQ>TOxj4y+^Ro^`uY-o5Y`utZx$aR91H*q$^a8eRq6ZP!Qb zrsB4}PV0{*N8~RXmxXB(H5rSY?QgheVNG5DcEpMwZn~9 zsg2Bl61X9O*^!kLcu0B}eX!EQXRxCB(yD0|2an$Z)m+Sc9!I7H?-O=C9F>ZdH2^Wh zJ7vg~6eR;zAuLd+Sw_HWf@rpL`nx?Umo6CIP7vB!C5=qhSwK#pLB&ABRWS}UkWnEx;m9!!blQk8Gh0aenqMcxf0axyR zYwd3HHhJus%6TKiSOLDSyEhg^g=Hbi`8DoWi+iMt?N?(1biazC0u{=`G2GpI&${fOJ+Vb94)oc;23C*e6r>c>HN zNm*r7#W2|;gffx)+to`xv|?6>r8$x^kx`S&aZ$lu3YaQO;GuE}&XPYUukD?CNCFn| zF|+A)**!EJi01?F2Wq%W0VT`Fi3ny?9aI`wo5|rCbL~B8TDhsaS25=!et(I*)jOd{ zavVN1IJctWJPN|zfLi&`{Km?%sh5m~mP@FJtX<|syI_KKCdRE-xMC)@=J?DRFYm9_ zKP&(MWGHk=42G8BbccJ14fGX;lVS1wqiTiH6Si zK8&7C#^Lrn@9Ju*9(ubAMh=E`LG~Ye_i4({kT4c833l8TzMO|FpR{)j)(1QIYd{hD z$fwMt-=pxo4$RBCcJ?e|wR=+Dy9eR2kY7|^kB`>iD=2Vt85>SgQY8M^&lTy?!WF2a zx*fFf+NDXKAM^#I`FWKin4`X4+n~#2gJ%5dYaSaT(3j?V+V^?6(kzs;pYcF+n0tcJ 
z5k;nb$z8gEdlSLL=gfF{ee^G1tjXsZvgNE^E!tMe2C5Gdo(C`=Md2MxE8It!^loqC zb{aJVo_=b|Ie{`!xtc{KC9^<~1yHwqzG+yfj;M&$dNl%4B>K$GS~he_ahp}h{^N&C z(Zf2mYN+YbIVy)$JvY@0Krsalo*}sAP$t}x8qC$GpBf*&zweQ1^0}BXPDyV?Bj;j|VpC%}}eT%|rT3h{4z}eUl@YEY@H+aR-X_yzD+e zY<Os*hK#A=9b!El7aj|4jK@x23udb}XzJKkVFo*tW`ok#pE#U7%v zC%M&XsX-GMGRdn#lSK?#DI4IR=D14SkV!5a&A>rX!ShJ|Ji*CVBaxp*0pG1#`2U4W z>Hl`B_6WVS*a;Kdp`Zq3cEs{+MnW-C{^5zZpxVfPQn`5PhPbn7=M1i%{->c)4CUO5 z4==F!d;&dWNIPieW(K?bPuBPF(R0L31+7f^SN{xm_C3G{#gn_lF8wpyi!ZR!i zru5HvX14)$eE%^0-mZVfQ_%bW`0}J|zy<}}n7PN^1_@x(Q(-?yGBsKWD>5X3qAnBn z4zpI#yC8YUoz|K^$fxfKz?<~hmITfRcp^6#JK8yG4BO`F0C(yp8|H#Efz)f(GCWH8 z#1FELBO!HESQjM4*fty@e{W>ja^iL3;F&tj^w8$)saHEd0GWxZ%XaGDi`K zGMlg%{V+Vojlkc$0Lc+Q9ic1<3;mesaB8Nn2h?h0ZSnZ*z>+YIdb=-PpKaZHUyhCz z-DQ8ENd5Ns^q;fWT0raQJ6w?sR7-j4Cpa06lb!RixCo+7;I#|Kob+BJlFHb+OW0t%P^Mj7L9u zBxnW==H@RvG!*1IpfeP)*CH%d{mdRS+jD{6t&c5g(kTiY_)MH-^Pb@l+`Cslc!g;O z6=9DsACJ;L~w3s}cB!26I+@+@o{zhkn@LkODS?9fQw|!q-fQ;hybG zGW_HV-MBcs-7lda!lviznDMT!>5?%!JG=P8YR{ocU?(i}SEGaA>$N zTThu-8OMo288xC?fNC7@^W^c`NKI*}nUXNW*nKkVX8=CG z2;RVJ_#ys#!-(|Gg6oaqFy5yEE$7!J9KFzXt71gdi)x}3}W1#Cv?0TFFr7I zHcPr`PlMQ+a!r6d>}M1H4@zDjm}dy{A^3adrkV4@KBKc;6PL!N(7>-vvO|A@y)ve& zH7emUiNyHyDQzUbor)&c~jSu{^lTnS$?W2jtoNp>}Jg2*cHX7Etb!FV4zCOX;0Qp7keK=*4SB%CGGCSIC3WottM&F9hl2B#gk!jIn^wSGm&_X zb5FbP7ya!`rD+2L+@wC}%AwIimLj7MDBE}g-^u8|2)4|G9I{Xf0udcX;bRRJ;r;g_7#3T_xPmVbHtkGKF!nsk`$e?d!6WI zuB7S;vu2St)ij3>bphb=wN@@-G7&G|@7wc?e8sFcgn-I32wJN%l?(zGln^=9}>pMjF0R?r^ylpWDjuFTy> zW@>zZn3^`F5QXIzc(N}|y!m-vSnom2T#Fl*!Ip>4t31a^MB;w6(VGoRd*+t8ySxy6 zd460<0!E)IyPfcqrH0e-GLWYC-`!rNt?R-#$ZIL%`$aQQQdmhY5%I z<-f*o_&?BoiHPK4`q{%isKBY*e0cZ2oT31+^WkBB6#$7Wz|Qz|^8c?7GXJKs{PvK( z7Et)WdWETqTWj6z%56%kFV~W>v^U z?#Oa*_|UFg#t`5a=O5c&h0P=J@31$?@kO~phGvb2ZhtA)tW!vwHQgs~c7h*MZ*Xu3 zW+SNXkZhh2zMl5_8RAjGdKfCoHfV?|R_0$rH^qMD2O6pu0-zq3sNZ^FM+wN&meXfm zo75|lDI@;~x~}I(`muu2PNbp!}3P~DjP9-^<1!SI8gIlGQ~Grv@k)2_=48uru= z%?1f$kQ_NO%TD3Jt21w=2 z@q3Iay(jUYow*x$C-j=h^LxyZ)U3`f5l>0UKmUoOYvZwYmPi#$kclZgllhL7VB*R%mD 
z{8XrL7YsH%v#%QWo@mJ2NL_K}oK99){&E8R3TSX zvHnmFxmd`1%5j`c0;%hg|Gf$s2pbDfec9B7?)z;@WSaNBt+KoAVspw&t=^j0 z3mYu6^L0?+9~(+YGb_P$Nr^*ya#zO0$SdkZAen+L_%N_UUwupz_DCw+A6InuQ)^Q;!E(lSkCo-ZqpNa4n=8G1LJ|T7bjDD&mJhb$ycp%G zGmpFmu-le9vN|(FjAe9A(~2(9v}laUWm~A{m!MbWYGWyXYp;z4K$5LsRJZy#NkVmi zJ~YP)>Y0kAw~ajO4?I}tzx;J-L?IHN)CT{YP7O=pczVyfW+{hcxkDPvbvZttAV-9+ zsjjZ7Z|)FV4imK)w^cg8M181GP0USgBp^R4!CtBAYly>Qsr^<;of%y5PyG#@&EzzD zc-V4J8z@N#a0Fr*^R8|Xr;++Pm=%a2stv~L#ufmX6c$tNhGb<7B%o*Qb-?Pne)O9N z4pp=zKq?t*C~)VQvm*BGFY6?G=7vLdTQ_7%cJe~1$9%^$egz^By+ zh#ljVBGv^$!p0fanc(bJ`Z(>52msfO1?!Ggd2$zTvT=@zW)53LLqJ$+9XZM>HShL9@}2GHBKgfBuP z?IUyj#5af9L(p@KC)C16@NK4$9l4piN++JtCVWO@^HLk5m8v?k z20n0ya+bBV0%HUO8V2fN;D&j3S4ONX`v#*bZxD(s_s?*2Su}pC7^-pQm4>j;FOUv+0mzQrN++3^jGr7q2 z$|(Gk;Sf08-ms45A0QzXg-@Fb^w9Mlhg(VdK@mqi^L8oHVLzXK-n&o*+Gj`KR@I^5 zKc(ZiTUs-RIiksyfzfxWTWj^mik^(^2pkMM3?Ryt9iG4;m`ccGru&4rX6~J@lIkb# zcm2vfaVTL+#0c~BiSnhEP_PVOG%GfPV7ob0c=Ott+jox#?!sP{Ou`&aYQbCiz*PT!txTlDsST(d$0O z-VOYoc-*X_4WbqLJew z@rTDjh0PVlUbt!>H#_is$=ci*rSXDE^P?k?f20k7COQ@9lpA04+o}-B?A7YSyI|o& z4YLGc5!qwKN)-#iW%z(ZVGMUg@a^=NS2^rybXfpR91AjAwdk6BR^V_xTEg*U*;Nnf z63_I4W;X~#{c1~FT$CeiLoapQkPHe%F81V8A5NNIAc;68otzLC6Z+$eyyM{nq34Hc zeay!D)UF-oU6^sQoVW;L+uVO=9SQ@R9C5q$*1i$3(#=(qfvl9qC}Fb}g)$}?O!qjb z74cO+`=Si-3~*Ep%I2Vb7?T6~rzbSYg<({jMaJt{n*Mo?%(3@w>hU06CCY}(=*Z`0 z198t|vuR5TBv2RZKs_}{3SW3HcfDHB{lzrF+h^_tCG-8tFnrnQsyb7~$ivM6OrI?W z;_M1|xOjw|R`{HA?$r#u+bEMcQ^gpbV|@M`-{?JDcDp}l2jJZ9(NpU085DQMd?QNq zlHYtsewBVKA*H;(50IVY8ThDGQOv4;34rT92d256{>cwX?hF)xMya_Kar+fYYIrfd zaZC8&2Nm0+vB3gw4!ko1&SSjcj94O36UkB{Z4-s6A4$d{j!X{(8gu8k<$27mPUIt? 
zTC_9jE^Tw&g|tWdxekhp#^D{5Ob&J1W)nJf^qEs;*KSTw8aV@IPDpAG zT-tRY&TA@bYHuDXU9Q(4up(hLloS>C@*bCG4+>UE6xxPxLJ3ov*kk(;oW;RDA zK2*#}$feHDM3j{*EzoXh`=Rs7N&iGOo6%2j07hTulm?fPsfU7Gta5%z(ok}4Wbnia zDGENXu6aLD9Ci{%*sCag<8=Mm{n~!P^1Bo+R{0R7=Zwfl0U36~HBG!(D-YuJNK(?N zTAXsz5CeQU>}^s8VlT(3=RybLYZbYrhWxzZ=LZ?6SJAJxx2_5637WR}L53_x+%bmu$9Y=lJf-}y zsRh(gO>fJnnZF%TILmSA9nJEtx|xebubMUXP}JjY=L?Su%8p%pDvT=)*pqEq;TPD9 zm(ll1pt$?=$@SuKs2%ZyRJsI=HrKThD*>w#HrR5;s2VZ&sc}g9(O;#sYN{TZPs2j}E)8 z>>tsXH8)@QJLkw*oY_>edDSgYCSu(dt!;Yc${>{vAqcBh3kukv z8>CT-ke^*fE26!GbYf5f8K~giqU@@mbH^-c7OP#BYEr;q+jbrM^4o)p75!4GnHg2s zHKslBb~eTZ4I3vBc}U9nFnmKe{}tp}oF=yKRX7F9WNWpPsp#fh*rkQyUcxA1=RoK>wRd`QNt(tEDG zdRv3}^M#Q8)9l5j_UfftP+hM4&a#;ZIdT+Q$4<4o`A*mJ{3}CtoT3%lIp@H&MSYTt z$Udp6n!H8mOmXEvNg6da^2X>TH7c|Ad0l$j`{(2gSgy2e5v{?zeUo5aaP~-$9(C+7 zRoFJ*)$%8T&!`rix+^NI4T^1SqsE9|mRq)*GauO|@a5)LskRTXQbE zuL&(L%&&^C$KN^=d9N4srF6KRBoz>{O3vpFh=!$92mP9=ql+UyyKYA7(p*?sLQ>~^ ztb3aC9B=(5M1OE&5Dx3-k% z2KzfP^l?NTME5VGKruY4`tc2VB&Pq@cF|UoNg1ll5FOkG- z4JT^dHc+dSv(sh_*)8;ikb6Cr>wv(4=$20oYa_z@85Q6e3aD=BONhS*f||HbYV~~< zEevkV=#QP2vTNK#p0u?>@114X3nmypn_RY^aZx`MC>2D^Q=1JsIz8m?wxVu>IUM+n zR#Vm;3Prgfbb`d@*E01p%`sn{5S#&cxaL&)`zLn3PIGc09^x=f9Q6&riw>2*T3;02 zfCoJvg*|Q-$s-~nPk)$~cT6Z8!@sOvev{m*mtZ)+MYS}#Iqc=U_~3KRnm&{Ud+%vV z6R(mML64TET|4x+vy>#YFUWmP?pWP8_X?F%x9n?xzX{cFu|j$TyTI9EhDzsV!)RaT zDgqMwt?4+8G|EHjCD;@+frOC(`SghvD6<56?v-=*t==*S~@uXRxX>8LOYr zZVdh9`Qe(G5M;M11Gt0d`H56Pgj@)*Q5hbd5v=35QrN(Cu+r!w*tck_p7%bi;Mn^q zxm9wh)=JG3{48W~B9xpF{FZGu8B{I3k2yAU)nf)ZEQM}_M?+&L>W8{ygl#=yt2$K& zC386e%;1!L#LT6L*T<^i?%g+aS18kEY43S*N#K`@K=Wk9iPe$dx{=V(ff*-V#{HU` zv(89bmv&u+zBJn*>*9jqiol&E;BujO@^|KZ<-O2VD&DH;eh%q8+9)F=46X;K8<-=T zvbSS{f3+)QZ7TuQ=BfvudMq*EKAx`y6m-l3^yccnmshTj2exoaku$e9ZIfjU-DW=6uAeWT4$slHzpq*4FjD9 zDA*O1wd*h650&icZiI5hoj3Z*J?7zTO)Yi2@ELD z)j0iE_fv#1xzGb-Ruy>@4kEq;S)zO5V_PURBO`Wjc z3wXd7PcvJ9gY+lWdmV~WgJ|*k$Wq|m2T)>E=2ke0s3gJ`RIo2v0}n{Eym8yn?AXka zw_jwr`p6}sOw0U4)py?o21@R~LvhbB%Ur?Z&P3tyI0#U$%vMYhhZV*k$IIu4Ra0V} 
zk6~O!FIkMrB-Q$p&PnTpe-D~K{|^?%QTU)fEjzFYWX3E3qJHetO;UPAINGJXv_E%g z3$$Xq+{fj%4qC2aLvd7f!#k}}^RbT#>%=iH4kHnv)0Nj;8IR9Zbfk=FnY$F9-^=htcp!0J`wCaqz#_#=eemGntT=H-it$ zqMHCsw4)7x+vo9@loBmLn0I^E8PWD5SW50r&TpSo}Kue&z?`vGt(_5bbFt1Ba{SS}5ku<4Coi|HM@Cg^VKQ@*vR_s+h66i;;Daf75?vJiKq4-(9{4{~>@LJ*@3Lmv?A*_?FPd z{0w`Y@VPLYFj=1Ky`NPNr5@rC|2QkSb9{J;gWJ!Vz;D^K<83yvdcM4K%Jl02K63Ol zdQz&)e)y}b{IICRrYAPa7}Px4NbOzZAAX0q;CnZ5yM@nOzKE6?^&h`ML(P?om7nC> zrd!+%xd6YB1R8~{S$>uRx&F?BD;u%-@{WfhhZE+GUZ`@JFZT_iacJdY{U&_l4fe zQ$F7H=2@1uX?FA_DfWUT(NX0mKRa$PW=kOFbAQLO0G)#~)c`GI zk@8&Rm9_b{ex>8n9O*KBweQ4nL(6n?VWH;fLEfL!o*R7gi(1;EEs3~R)63`AP5M$B zGSlPaYq1;{0zcw+Z*%W`mjEx-qM}1BHErr3oy5LvQO0~5Ju>t)jJUnMbr%&av_a;2 zjsJejc34r7FpZ-Fl6<1~X;1e;=d|+4N-i*wGT%hUGqa+b2?g$TMDTN>6cHy;^tt>( z*7);2>x>RkG^zdfqk^`1L6BsO^IrB4gMA#Ry#SZAzg=Ov^U*>ouKTUC@MsnntNc@O zJg~?s0jdd1>icOsfw3zdNtmc~POt_jjel#$&t+|YcVqdmatvd&P%1v5@UsoL$oEzf zMyOWW^U&IF7gYJA_w*|t7mk$Rd?4q0lO2hlCb$#*eW6ayVgD` zC+IA7P{n=@swL7XR;pjX-Uq!LLzN$g2~}U1%=Q;SZpLbATC_oq|cJm$uV& zk*AZ0ecPnI-Nyfhd+$`dzFJCghi@8*%L;ZMxO4x`<|{S+=1(vWG(b8J^$15>OwS}? 
z@(cKtnVPk|l2GG1f-9d?zq;efhRZ%fJ}&YjPu6DHy7pWCE&dL6lTuuoeb~{gAf3X~ zcWSMDhAz0sdjPh1SMM^oD6ZrQCSRSuy)@;mUC4;Y+f22cfv?1yexC|c2PGx+y{>7g zd6tAZ-#mSf#os(I+nZHath`aY1!#E`lytxE^=~aT*OD+TwbS>I>sm%gd1-UppH*+Y zvj6F?5_UxzAor%}LBIlUwU{PrfcA^>nbVbhS=&nq8-RTt&T9Pg^qt=?`?S8Dc5-NG zYXR2r1k)1Lw{79u{p4@gr?i|$aDhczFYFe|wFbUU=}UU3cUib7E>IJ6wVHn!hdo(w zQELNYKt*wOEp6vpY7RWXe7P1-%i_;6iwbElFNqC5U#;=aZ85#A3DP_&sDtS{=j9^b z63&OcN28c;(JV8Ecj?TJK2ZWG?|z>ee1h>5FY;|`sR2x&@M4Rpn<*a#T+RbJ2!FX` z(4)N3bEi!rzQV#MUto#RB=ied~A7#dXuS zei1-F{s~?+yoDYr+?szEXjnh};*`tygO-}@*8|KB6p8(HtNgE(TpPhXvi|84U=F>l z(=VJ{#)W}~R|Cv6fQI}dJ}|$@vRXYB{;|H%8Hu#{=^bEhKt@K2gIXTl-Q1g*gi-#v$QRR6^Qy&E0@$zz@buN+ zWr;JEQf9a6n%+XAo(8=FW~u*u$^vK>-S=8#?25LFeAL_Nd++(J>Ef+TinAvURHuka zoJLmm)6X52u)|*qDEy(4^ItsFOa*=<*4|$Joj1YqaWtEeeBfSUSNL%}0-c37otTi8RTl?+L3_c=cF)1XF z|591N3BZv9MRD3kLA_T*+5quL0dCG;%=)E|fj3d!*pjrlx1hzezkWLPjdZFK&_f+` z^$DL*T_7=HxuQ|I97O5Xqk}NZu8yVpkBc=1?sS0I850abR zsJs2LkA1`Ri=37=^_H5~M?vmNMPiru9PM~n!}jNs_CX)CX}4yE@V>yxcLD3r1c`lX zu303Oz~3C+t-~4No*d|sB}r`9oBvrKFtC}Hwx4|S)qxcO-VVF~19%4Nn-%z3g=`S$ zZ`$}q`m&GAwSbSCUBzc`VH_&&%YRG`TMs}`)fO%?ib2!ZZR#dpMIq-D5VaX z*jW^p%cs0Mvk_9iso#43Zli$x1%saKM#z*?O-cLo3y+qzy}mv!F7o!*13vomMY<0L zw>G`2Fr7$dAeLu4_gMSPXo4mV6!`+uPU<@+nS}96;$NE8Gz7R?%|+5bU?cFTLil`Rr>t)}{|)6?pLuE2hs zfj<0vcjzMEM%3!i+)OA(W#g-pjgmm~^0w*e35Im){l0U(z=HmM!UHIT_!$E7D+lB! 
ztY7{4M!@fVeRMrtp&~IrM(6k-tvs!oCd-vA&*Fgdw;oLwQ6nT{@?R_z71jCrNB}fz zf;Igjj?ZnLN@Z;pSp6ipI_OyTt?{$Y?~Py0mB;1sMbl14j|47Hi0hXEi_u=dqnUUi$(}sl@4a5NPMMbT=NSD; zULWe&5w-$%Xp{vx2uxYv{D%TSSVc*lFsMQ5?Is1GI7Emwv8}Z&rnLqLlKI+spoxuq zvBnKgP&cod(()QSTBTxJde3p?v7_~;`l41B)CBS&1%z`drm9BbuqoFKSGLR8_#6YJn1lTd@h-06V?3EY!S-T``NLN+ ztb`^@tnAtNhN*xpmxsWReG5OySOXo$uNNbjtt3_}#f*rG#P|8%2ncXm(>)!Syp-QR zmog;;C}*Ohljy)?FeJ8qJZkeDb^Ss>gl&y(v_*OJynN{OCLbW6iP%FMI;ZuIfj}B- zYGeA{2>5u8UvH3Gn-0%HS@jWOp(iiLrg&EelIxfaviK-|3qp64#4BPMyh654so`|toz6h9^u8Au2SoMR+@cA|dTb7gIDt$qOdtdr)xTcHiJrcd`&7@Twa)}d1zwmbxPB#gv> z3diTjb)m*~X^5uh^rE8CHNVi}iSq?tbes7_=aRC(abNmofhd7ZDh@o`@J^;o7*GS- zKe*o>)j^HuY)K>lhs}mluC7sQzWYQJ*t8t~ro#zoW;40_XO*O%n9%Ka1q7RQQUEvc z<%q+Ex|n@^lr^9jtD7?IKg)-4{b|(#zi@FH28$m~lA)|{?guWq$pXF4T41*ujVAy! zpq(4+X-^tY0DY!7X4g{M3Sl@Om4o{MpiT~6D5!`l9P;nAC)4xLA?P|V2hGxP&Y~{e z!%D2FK*xmU*Kt~s-i#l7&7edHuI&nz{nxJA0?fcSMSzV-@onrf@|N78YU8$z$6s^C zyg7#}$ByOg>DP_rR?~ZI1@TW0c5l1`$B7jC0eIDNJ(=RJseqk#r%!uMtS#69c!m;n z&9C1ATIX3g`{E`aTwkF9cyvo=yL$X+n3UI6C`}AnPk0Z;ZM+uZjr39U~@`tc7 zx9i6rZoZS8pbFSw;1O(jc6U)xI?ozhgB7sa;eaVd^Fh6GHR0|~oy*g+FOeFUmmB;AH5nbsR-1wf43F*WETtNAP@^FBye$u z!)R7^?$6^G`0$6TG!l-phz}RZ-q}80L1NE-PVEz>tvI1qH>ubG{~HJg>MF!@ zmbQRC;7QF7V3s78;;76m`wuBy%T^PpoNrPJJ=y0b3Z6U8NOxqWk8Yq3ASewo;Y$w9cOWBDRK~;vVd?gFI2V|{-u%BHI3{q@u(`Rnz>09!x0cG zq)2@9DB$(LOV0kjulNlF{JPFZ=o;5yQcv%)wExRC+ROp2vs4%mP|wrlZOwfgdr=vf=;vnbU`+2u^w zO-IOiT5$pJ{lr0jZ}PlVSC3lKY40U+=yRP=c4xEHpe2ediAp23v&zpknEQSGkj826 zhKY%_bG-qN+k(F)*D!90MERpOJiX7e7f7VMjo$ZZOJO0C<|h}ZJclQYCfIIiRK#kh z00w=Z-=IrxZ1~Kejf9^3bfniFJ~ST|U_e~*AWG~DS?s9Ey!6Tve3~|nqFbT;zxt{3 z9I=B4*1m*vuon30{?}e3E`7J2D23eRE}>^n2p&}o_6z^#4V_vS!L6}B&Y4TNxD(!Z zWy6&bHo)R&^JfAK@R)%9PeeR`&=-*gzqI2~{fdbWK@*=XJn!Au@N}WVz`c(r*hC6% zvV=7@5v7oD(i>4H6*lGosPrQ6SaZT!=vcQsz|wz3?aA4ECwd2Us;C_TdR@TAE?45Y zaQ{^IDB#*F3`}HBWY;&xN~M31#97hc!z-KbB&hJvKmqGEI_=rV1FP2vvx=Um(_Ole 
z_*R~|r16_ewv7OT7K`@WaAi~F;zwh#=BI=--S@x=&JiqWmBi(w;WFo|m#Y9Zyi9x~UPeR8W(7*w&jS-axtq<}%wYBYMZfp)B&}_%FZUtRw`2Ylo#JmxjGF}#i46l>%0W`Jt8Rf@xkZwHim@+ebk z9#h)J`)x@1S1}x;{=-C1f5|>)9*g3qa1Fb3&Qjp)`t9*00Sj+EXDg_{tM!O;LMNqY z>VP9aWvV&!JWDy%LTyX!+>J?otk+Ap}s0 zf0;pb)*f-8boYAuF-s_L&z2CjKo&(KR`LO#a>M=ZX!oySbAgyPD5<+p>n-xu9{MRO&VM7~s9!MHn}pHJpiN=W*$u#am9&?In0X^)G)~uxx#59f^lBpU39* zmka}zi>m0c_4$_pW^KL$M6Chcq@v5yELJ<0HUAblNbm-pl*2Zdi|4hw<1wX36pi_I zs+Cp$9iE(d@Z|6fp0s)Nx6SHUC}2paF;|q!3i0~1O`Z$@UF#9ij=uTMl>w*O+3MBV zx#Zr;!XV)`dTXxkrWb*G1@^3dOOveO2_`Uu>%@%OQ2*%+E107mvWWI z@bkMU54R0>lxNGQa%<4X5}zZokW;p#y)4NEmkm$x- z763}{_y!9{fv?jCjz{r438;LY%1ObfB=bVocwX{vz|-GP7zQr_%6j(!irK=C3{R8= zl=n4sUMm#%(D@ADmzvN9v5QkKu2H=7r)?$M-{nJ3on22jXbHJ@C%U&W)463+wm>Qvzt^_f=g`#$J1LUflo&ja1?i1{rKa}cd!yFgiA!jB2C!_FbHq6Or>o6Cyn9{% zJG|yG`oj)ybxR631dcwgNsBk46(cAMO&gEH{spUc(q)6Uo3rexK#a%jl?Dw1theKc z7SF8K$EIq5xe-||!skf38CV6&sN`8kA9rzt{gWwx!~>B8(eTUD<(?QZ*L@iXq4fdOQe}-L z0}upUlL5r}7c2aZVSjJB)Yxv*UQB~GazrYTYfCGp{Yxv>-%#lKhAU6M^#-IUY43Lt zU84zCt!7Ya4&`0|kM@4&Ek&3!@G7g*|1}bz-xyiwFiBQ-6tKMn1~RgJo#6V-Zk3y648Gd@Xo{hDEWzU=3I%u+(l;1;`r2{YS=OLMsjg#B9QE`~?l$;|QQILhu$L0cZ2?(y0JV zd?(d>2}G%F74S9)1EBu``M**X$5X46C^eD4w^RK&@OwJ&Eqx@f<=b9>g||1-!}@+q zM0qJzeMo2{7Y`_;;UXIHH@9!sYUn&IX)iv!fL(TKZ;*8S=0ivugTIoDQownAi-Xoz zUh+Qy6oCMpW85!Rj|V(oH*4ICFJS-%HCMhriGTl?AoTKLB%W_HTb2X<@Kx=9=k0`B zJ9R9O9lm|yuqZi5EV9Pep}QhCnvjvAmmrSLgzHtQgL4 zsmQVOBM(7i-GLl%wnFW7N=D%Na?)A>LB0j3MgG7y@_>@gvgkf|_i^Ciji~()G-cu4 z#^ZnOB?Blp#8DJtd;VS&6j;=mb&KLjHaLtx2C8U=e-ClwTe7=WX93C&CP4k?TlwKC z3b4CqKD&D=LpftKc%^{IzO8_j2xU*p7bAcGp9*a-*C{~-lp@xuoGU;@T$}Im?45u= zq!elPw}FZb4dZ)jL6P%kKCQ`1r~mZZ{Z(A z04hhQ2upNC1^Ni5UjRQ=gLqRG8aKZA<6HR&|E&Tj@vQ=w3V8pz3LsFxiin`5MoAq7 zs%QHe%vUH2hk3$#8D!e=Pcf}C;(IYInFo;kJ6L)N)M-(*BM)M!sn3De*%Yo7IzP3e zPBinXNGoPwe`DAGr@ilfYcl!z29>p-po@YUFuE4}t`wC{D3%2maY5<5O9Z6}B(xBe zso zoHOV1nKPd7)|C_<27F%#lQik;KtQpf6X8u;(kiz}A$!hGS6{mdMlYSC%m@Xe->B#; zND5y5Az|#Ptpc;x6DE15zy!3fo&a4igY2TJ%41SAozVn?uDsmrYT0IUTgu$QZxUI_ 
ziK^M@+yx_5!DM0-a|BM>6#<0mDG;g@pMAvxoQVh7UOUP!NrTyS^OU&*Am2N)D&L!8 z(*8ByTM`KxBL3CTfkHS}_VM;kmzNV&Zb*6eAQyoiK%32xAT5mm7#=`6n3LT<1}6_3 z1-+lq!s%WcZ=&58#GHF~4XlK~NFLZ^1kF@Us$2#!88cdg9M0I%y_Wxw;caCu#Ymr9r5!1(pvzNDm5?LuoX$&O16|??U&e`*pVpgGlVY{ zB#4e<%7U*V5SN4!D+6th@0|Jh?@|V01XL-V=%+`3e{IG>L6orxD}JzEx=f$2$NG&Q@sMizL5$( zDyZWBtrFIW@a8S|CI)i@DkK7C)*4gh_US3Vnhe?`C)jMdz9JI8!ZAkyA`&EYvQ7Gi zD{2mUOTw8_2T~!hcfpL%tvQk;oQa~y^M~$&4nG!vV8UH}j4D8KKzaK=AKpMGse|&M z0M?QgrTmhL%xUTFMQ3m%Cn zGCw207H!@yeuv(`R$oSlaxFhvnMeUpH<%mb7%|YYx``WNA!r5LJNOO0WTbW)f{IP= zwvc{1v212f*LzYxq3Y@JsB?7$zIKqnXs*Lzi&x;r3``KH{Q3A@lm0oO7&R=!MVQDO zLh|0vH`yVnso`{O%*pt#$fxY*FM*hEN<32*iQixnxT0!p+n< zfB_eE?FxBW}i*3w>wBV8}sf491LOu8IKo>eLc|-a`1;9dR89ruQ2-ZVDC> zIaUTKx{WW<#E&b#kvhp{r=$kbwSDTYU(V?>vd+M492;%}XfH57;Npny?pOe8AATCf z%5%(lXGJP9$wy+WQc)k?{6AV9;<;J?K-qL)Lq2|~v|n*usX-YG3Z`PM5OeP*PRTaP zUJ-)%CN`@#O@o%s>og;mL->9tBanL(s@a+;sWf(FIYS@t5#lp|YwFXwrq!F#XJns= zIl8*OU^PeuCP0h>(zh}&VF=VINK;EdgpM^%`-R9mKZ3QOH&5C%?)wt2V#as3I#Vu zA(#zB-J}mBq9HEtn_7G#mI;Echxd$U+4%njh*Ka#xoK<|5RFQK#qnA-Gbbz**EAs~ zn;i|BlAp1WYXr0uLQA0|MnDNsz}_~1BTU{1sUF~|Aw&JD4i8-V|ESHOQ69styt-?37M{*RbapHMbNJp-zaK+S>>p2ar=YGl^j z0*}Zdv=o4Ex(R701|Eo_+gh4Pz@K>y?DHyJ7YTlpD8i6KG5r3}*?d+Cn$BSCg6698 zfZ|pk0-be35bIylM@YzMjB+Id|9T7#l=$OUVG8_x3sqzvtY zQsPCEZUT-Jc3ROfY1N{5j_)v8_E(1XK#8Ephav}5Zvk_1oxqL(Fe1h#k3d)r=~#wTegoJ8P%PS?hY3dvv|Dw&u-4+-DH z*xT(Mh*O|O;&)q7r%H)SGZ0a4e4LCH7l8mu6)-63iG5mdA$hD%!BbxKyHNvbc&_y~ z5lA}$y-%T19Wbc?HBP_@gA1NaGJfC)>K;;D)63GEvmjU0CiDODtSCStyHhIcGdnfqp8jko$z}J0ZDG=sUie$fJn?KtJh5%jgVC zZh!|Bzg60SMwmJvgH=%p^f%93p$)E9Xzabx4O{tyy0GS!vEIfnZ^9otS~#I*6tUdPR!$P@sZxPu%3uSdGolRu zuTlmpHTxa;BSG!KsW$9x$8`dvV}O5sh>bkNL*0mP)MPL&)Yxn3`BJDY1hlI{rKc;R zGe*eXbj!e~g1vxHqd>?iT=H#=l?G8V&Jo9!zr^cL;css>Y;G}#h-hlLk<7ntE2LZk zfa1{6^@(nXmZvnAnEO7c%@0O?cQ==z8G`7>r}p3{%j1fi?q%>iyTU z3Tb_B?AlCvZN6z1SP)iVLHL0_REj57-jcz;(%?VSk7?P z6fc5+No$@`h|;oA#opcy1|p6e^Rdo&e&(C#u}=UG6lCa2er`0i`zFX zg)U~WAm|P%BEXG`7SMeGT@#s!5j(zKCa$l~(Nq48fXMdkC`IpKu++~DY)GQ(CE;ol 
zAU8DaJ}@y|*ofPW|ri0XiviU0)dqk;Y-fB_8@!}gn<1a(NGlDBJc z@?)dAap$SEy;QQiwDmIAa;cl;+Xa}{)#HHDtpXE=ODG)>aIj4dr>?|X_k_}fC<5Dz zmhoQxxLso}m9Rf?Kd+WTSLE>w$v)mxbUk2U10tD;Ck?RHWzqys@^`+IG4w>!}K4AyqEGlM0KMi2S)9|t^hH$-`KUzJN+eJot? zv~l#idlJ0I9JOulAzxJj@(5weYqklY^5iaZ`%b<|GxhM9WmFT{c$wQoHeW_HlkJ(l zT@y$^w-i&Tlx~@FZ~*-~zTUoG@FcGwvC8)Vs(WnIgyXY-)5mKQkNMgV@U{V`Re0?$ zx0X~NRA8Q+7m`* z;-ZYfq=2<5Gk9Iz^i4=MgKjl2>_Kf>!iD8}E$dn=b*18=5S}8uzAVHOUIo}Wfb#!l zO5ytn0}8Y|Suk^*-7=z2Z8s#u-!^@P&h7hB!63UyJIGw`vy}#bV_W@)K}$DgC;dN`&+O#8`1%H~9v512uhf3m)E~w{(T?XFqV_BA3FIrsG5O z2}@Yc?3i~W8TJEQ`q)Tr%!QMXBcJh~r4#q^NAL1USZqrIwa!2DhBlbju>jNO2EkJ*$(rYKg*D&ZV z1Ai7tucM4G<`WM%kwa<=`@}nBT{#|fb>0`HBU8UHy z_XgM48Y(lndcK!h@R!|GOgfk|hXmcc3%VTx6&wLj+Y+;h$k>KrX5!f`P3WK%CdYonMy3n5Nog(PT%2}x< z{ZEigHsvH(Zr0rzVea|KBi_`?+P&v}l3KP*aQ>+?R<0?1^x#`BJzp(z2$5y=&Wrj; z>yE(x(lTz(e!pmuyg0YW9OOkIyOScA* zcDH-c2MexD#a^-7A|zsF>{{kozI zeIuxS)=s*y)^*9N#&#~vxFNezk_7T zXK!9A8r)6f^tQWBTt-jb;&o(kXUvx>V4kfo;s!W>{F+XptggI-uOvMwojD`kuuxFE zDSaU1uq=-GxBmphbc@UYrA$;8I4N51T8;Q=TF$m1PTCW#dBTv6CO6ty!m|Ai+OQ2j zmB-7zqOYmx0z!eXwtDIO?2pWO&p|)OtxEU+=Mazi*q*gL@hU?v4`wl{PPi=g zYD;q{3N;J;FVBvivJ~lQ7xzSP}nQ12H8{+_CZ&?>jP45J!=s? 
zq{Y2V6>XOKF{1Vmuer6@P4p*fs~4Hd4s9dmONW}mbh8-ta0S}Dz61XkPN{@r&Ad_W zGg6x%t6ICdy-rY0@ z<@6kKc>50h;O&y=)?KbkWVKLcu8S z$BX(q_-RPkOJslKcu-j7r?fM(k?6XFP~{WvP64rgI$xQupGe&39C^k6-P7|r7j_1g z#1ix?QX}VP{loZ`*oJhzm;Z3}f3mI2bxy7T8DHx}9JH^wRrT^u|KZQqN~i-B3*=1v zs2nY3QZi@52uZ#%+Xt>Vy=OwvZwt*K9PNWVTqYJJpBlF|2L>;J~5%M3rRvsjTHqY_3#5}l!rL(iaZdy}W6M(TDj zj+e(2qG8PvOs8o_7H5VXRx$TnyqG=+jix?|{+OPEn?0`?z;1P&MB8ak=kdp9OXeO< z?8%#H9chY9_(b3qv@~$hp`n@uX!MnheklH^Ye&TgR-TeHI9x1|H`XxfN{&Gp72exs z-kx>MofVHM(?WeH*BzXKRcFu5Wn1CE^h{TM6d$SfJVx(Dz}AVm=g5IP zdhOn+q$jB0{)d}4J$o1$iqc#v9lGcs*Ad8M=HZDtoI<%HT*P6|#6;WS(TIKvp zlGvYR7b3$nxhQx{Va}`Hukd?!8RKHZT0fLyQP>3aUv-IxtV)Bmkcr2&q z@_4mg=%nRH{t==rXXY_7TS_Sv=YNBYDV=w3G?@}lXPH*)AHaWDGge?4hoMQRm+DuR zia(#0k&r%ViqnyOcJ_t$1*(3I6j5t>iZ`lEHf>Fx^XTn@MNFb6RIs!c z-P}D5+@7o<{VT0YVXVMcx*v%WxwRKtJ41iZ?r*<-IbK#gTu(jY7&-){sXFG!De!Ev zP|2l{mKsyu_x0bew-3X5gp3`Kjwz@bfbpl%Rm!sLF=7dq%b=M^S`T6u{&5e+Og4AT z`iiaZSXP1P!hnyf5;LoU-=XV9K#yEyXP^)Et?FhD86H_+nQK@ zGZ%p!?pIVEf)+uy&l-y-^~A5;FCGS_^o|Oxm+U*8<9jKNw24W=MqU&}cqye$y_|^G zh!>}f>y^tpPW6Bl8~TAc)_*YIJ<_9yiA3%0n8UytI{zpvYJE+Pk&%p<<{FF^x^~#f z5Gg9_?JqI2cA3>2d!}7`FOW3(c7pkF$Npk+8u<@Zvt^6?xf@T2Jrg)_kQ{n`G@_5X*_S#y zNjTHMi$; zHZySeHsh65M+pO4YZp~GHk)zoSag2tvm#^WSd3+0AwRD;q?Y97!=am$s_Iw76wK_B zJrSN6tyo(98Az~vx3vyN^C-K?%ujD9WUtO_mYk7EgqkpqxeSua0&O_w^^oN9UR$}i1%ydB#P$mC zgwvlf*0I=M5(`N3=~nGAd7i{l70un4MiO3mLk~H0uk6ltUDG1FQd!HIdY7PWDr)$6 z8ikGDn73$T3AUoJAg`lJN^*L0`ep>Kk00|HN|w#e0{5U64EdLGIOn*W zf(r}_%uG@VH?1k6X6}~%l>wjeOjTp+P-b1Hk8IUY0X>mKgE3K}SG2$7d;~czj{=S3 zU5V>R<8jh_E1v`_NIP7TPJEObX3guIS^5C(#QQ{8=!P_y#woK{6?|a1>~E);z}Zb* zO$NgU&3BlhYv$?Lc!X}RyE`5NA`mCF{Vhr&CTf8xarZlt=RBF4opg=Qz)@=HaB|^b zCyr_DImiK5jOfbRIGs;9)aKFO0{oXzbJyFMyULXpj|LsObD)Li<=lI$*I*o`Z#@kv zxixp?_c%Yw*}glA_C@D@c}Tuz-6cx#d4s7S5sF8o)=ImZ6uhzOoKvEg92R2$MhFI&!POVRFAlR#}kN ziv8G^8l#I2MQf^?l~{(VwP2F>jq*bjo>B0OvkDlCEcYbRMfK zzo%bDqEIc$X+Wd*pjBaIel65{0I9mooC{w{G^l;QfsQ*ERr4DY3ceUHAW#q}_ll z_Rpi$m3?eYAL^CNOz@m{ZwU)mckRI2pu_1w*0HS}sL6bT z1X54&({egrTrbqJH 
zhGhR!kwc%BKB;X0+*a`=^sH)Vx0E2M21x7vaU5VEE8L_G~#e ze9q6~0R@cEfUW(R#6Ysx0rv zFBX4v!T+RX80+c;u499YLugLL+Jze_M`OAgX&23TPi+M4_DIubhh3P!8ryPa^rN{;9>i`> z0>t#=?uDjp8mAdSCVr8F=xMF!p7`82tkUvTO;wI)j;8(Pxp7W-Sh2f#PG#7Oh252` zFyAu@+5hk+hw3L^@j~aFpw6&a&ei-yT}rlG^wOoH&QkKQg*u}D;w-A_;43CK9TdFY zOs)$al&O3A9U%aFC>yOFZ=d_e(@FUSB`*F8Jrqs|5Q~CjSc*;;>RF?=i^ZH~xEJiX6S* zaH(*X%~#Xh`e1(w6q+3^Qumz-AOJ(^{?O;5ixsPRbN%SVgrI%<10_Xven*u7(LyBS zb1JX(%RdDPe_97}@X*bMB70Y_ToIeD_{iLH_Jq0VZ=GfHV?($h^_>YKb(*Uo7f?X* zg!#`OzOnLwIDX3Gr#k#phgJ8<{#1vb>hMz?fJgr68~#^?3Sans9Vvf_L|xmNFs&c_ zG79F-K6U**MdZ>h)6VyuvUZ-}1`Y7{?s)d&M^Cg$+rC~^eo@EhjB((l%&!X58uE8a z+St3})Rn8y(-;Tj6x$zOB&=L8uShl0w!3$_H3{5%1^(;^w}VPm;Db( CzhF}U literal 127488 zcmeFa2RNKtyEi^U)F4G~lOQ2FA)-zY1Y7jpqZ35$CLxF(Jp>cI*XX_XUK73dHtJxE z|Fiem*=O(X`u^8<-uJxkd%p7>T#vPkXO^|@x!1k!^1JWn>ig9^=(eyq5zKoE23WMVL|pH~ldVRAg?Gi*UJ7_*uSr95_Ah4 z4fycT?|>kngLuZ2QZ8J^lz0EK>xBJ1_z-*rs(@bR>I&$#?bD$QuRtezSD-R;d~0`V zA_Wl+6~Z|70G(>QGh?=eiNd`1MA#O5)fZnI}{rOer3o#OVHHXX^YkcJ2cz zCdTC}5KQw}58ve#D^B}p=$>Zd4Bf3ZGr~g~>&)-=vb0*oD}Cvk%r50t1`ceWjrBRo z4}?UQghIq@t!zyNMC>t&era&ylg}k=oT%->F0HAk9BrP?!}x4d?rRo6_5j{K>vnX! 
z5xEMv%d$%g*Pf8W!*dI>Zcd_^e^No5p4ck#Bk68$o*PpZpEWAVd+mX^lNt?1;4EX| zoK87~VE7)X-;;zB`M54fn4!kxnjfjm2&m$U zo2*E>TbkMweigZG=^byMV3^P`R~}Dd-*-S>s0g{Y*qIGZ&9pcY@4{?RIjaxe$t6>V zA>@xRb`1cE-eOD%1pQ|(=I`|w&|WBN1|WLA%Z@u~W7t{P6=S>o{GA#+$QH-&vJojr(*LE9;174`XVBLirnukr;** zv+v3mbo#?ln4)W8LQ~T~x|5pT-QzDk z0{ix_Kp~Fo1nPC^2_#{Zokow0UfVsV{%qT3Y{_5NUX>%0rNd&$UD+9YI~7F0C71qK zqYjIj-Od--GSs{+duP<8SD*_(pnc_$$#ZGYs~F!~2vSa@3tfF7PBxEupCH;db@SVL zd>X%RiNv9JV($HiuSBTk!!!$>?M`FA&AcQM9Oe!t{Yt;QbeJ*QT)yT`lkJ)7!8-n^ zp~`hx>x2BjcjIT57O}z6Y5Ntvs~_)KvpU`7$7){obb*8yxXtNRm+v2se++N7Dlob$ zKrGNiwr(G`K%LVbzE~&rvc#QO#%F;29SR!N}7wY~v3tv!c#R!E=9hre$e%B0*TdgfF90 zJ;uEw-0iZUzHP|aDu4Usvx^@LBjt}})Sz;WbeW3|{8fKWHYga{oL({@ zw}Xz1ui6T6+FW~HI+XNGye>EHP89WW!ZBZm{iOedb=v;6=7Ui1H%ZoR)gKg7d95P;lU2n3s2TDnRq|JL@?UY)12b{t*Si6hbuA38W#P3fyq1O6vhe>E zS(s8VB;MT7rmS+tZ6(UG=aVT@6ewpZc1rpvnliw6&>q~vPzLuJr_?&VqnyjQ?7hs%M1QPl;_+* z>d$sc31xihe?7)EJ^n}YhYhf|)=x&SKyjW6$5d*_i`ePzQpHM6J}IJHg0I08)(cyj zx@%8l1voAL;%;7D*wXFTr}DW1;b1~Gel-LK@$A2^rv&2ubg?^YP-m6vvD_uL8mYGOTW({Cdl6%WV=%e6HN+;_WT|-!m{Ld5dP;~fcJ&}BKC}$5J@k8 z1scFYw*C9c2NWbn8J&<~mqxwNM9&^%=>K+oshdOsUJpW14_AdM&BH9fm45FJ3DpUX=9Nw%s@#&chSxsT9mQeBhkU%^#MixUeGDq?sA zibohtR0qvka_XPB5yw?SR4&@1KVf1nxOD_@G#ZSSGi+D(XWp^|lpBjcIDsZ$Lqo z-f8iM=3i3+AHP!&w`^L_yQA~`+tfOLT2Wh&Xg4n1Mt=6+0T%DtP_hI3o_49*8HzGDU$qmeNs zy`my%cTvFI%qZTR(n69V#zgoXfz;O*$||uVg{h;EZ?oMCHqLJ@_yRtBse8J_mqks% zYvjnZ`L??S11u)JL@7>eF(unjS5aA6R#KHjX*?qs{-U>-!1XmI(Ob833VoH|G>*Kk zj5+blwk#>*w=s(4ox2o|?$p>fzR^!2T(&3!5h;lOO&2je7!C(tES-%y{?Y3((P0m9 zOJoBABKO;`KrU*mI;bg_T`s+^+(@sJ)6PuEo9oNpN}D5RyQ$Ehp2GPJ`Vw1vedC`J~tn8 z%*7hdm@dSL=MMkqgm-X*?>>QdZQ+uhA8V=JovBS#+q}1%(tWgyjd!yt!8wZGR0m;j-X zX9_A0Ap%8y#m56IY;;FA7?u9Hrq^L0$)(2TOG2 zatoTah^-D`!um+rFN&4;Z!R?P+U|x5eK)1~%q{a(GMA7F(IE>DjZ4?&Nhr1)Wb5hr zU{T!7`Q0b7bBQiEegNL^0kVKUy7LA#b23n>#R`ZhhMuVi3C_y3OwDbt``C_q{``<2 z8;8z3o-aXRkCA0lYmX>bo0h|(?j-EHE#`VyW1;Q}J=}LHkN<4lv1$>LSPSsRaCObj zd76EN7ZIv`xRyQNv!Wk07U>+hVs%hna@sXBF{qY`l-rMq%yLQRp*C$JndFzn1t39a 
zadYH7x3MQ{k2lnX*(+iunji!Vandi*ITbl1r0*a%W7`pHdNjG+g)2g7UJ63kJBk|Uqs&KW*)$CoaynYrdE9fFNyI78re=>V5(Or>wS zdF;jVji&_D=M)(pl-UTqMU&_>@oai~Sr$9ZIeL*sj8BvXOZR=+Eyb1s7&_kGa8nl# zyV%&ugzF)r2hguT6Q$sDN(I^Sr7fGSU83RT*z_w9`yu6gKA%3KC--S~!DX)6yAt0E znt@VpANFVgB`Vw5+HY+q@t&{%wn$mPn^Q__YU(Q7KlH|~C&B`1avMaD#XFUXdb)iN^X*`w^TTwd$X@;RhY>N z&*cy~nx7?{`U&1-t4fp0-{eoAkO;x~Zk?&qHZ`P1C!FZE|3uXh;%`fc-&5$~tm~s) zQUSdE)2GiyAKjwpVP3Xc!Vgp{5%>#LtVE@^XE-V+g}02`UP@mxc7KFrryxDaJ1d?IL2aI{KmUFU0AFI<9pcK+mXLnF1yp-`!@g41R2d0hzs>PC$o)r3I$p^ zQG+5FVu!?uu>TgcC(}#F(jw%v<1lOUXD>UZw_1kB@%+#FVL(`{MoB4*_%fW zBrg`8uAqp|ecjm`_0B#+cBXy>XN0JgAa4TBJs;Fe!%4m~_-Tbu3;+=URJq^0PiTZc zU2`9Pdh60oC|HNYiRpg2@vhr{VT!`PG9BW-=zj@oj6s*RYtJ)LADDv*MpOWKV|Ta- zJHK7P76ZPVg=KE`Pp_dQfY)0ajy?k_t!9$#>+X{^*?PZ|epA0yQbMYau8Pc)N02b5 zls+Zh;a%auR7@?_yu!V*k5gyy``wHi298y`@pBzDawS}5nmSFD2Jav4jwzo`rtKoz zT+%Nu$1|Zk$J>28y}P${htz0HFPaVOadyN@cw^HvAA0>0Z62bCk|JJb`z$zCm?P|-ct$}6 z>&!O2IbFECpIs~XOM>A$Oi+GLSg_m9@~rW0bn;A2kxOc2sj0g9En}`TEO?z-dih%&u`tWb-Q-<=sr@gpajabsyBgpVmDscBkT;maVXBPP;z| zBEyKB8ZR+Rb}zU5se3FE zRpoETto`!hrNSL=4#k(#ct*$D;&xH;rQdl~`PFM}b48Xk1XXaSI-qnU4%i+kxI1(h zYh|U*6Boq%YY-kY4M@X?Xc_RA&I|$LDy6<@u z%dZ{2s4~U!gl&BpJwrldNl1h^dl3=<2=f4k$n@SARi`)h^s2T*ckL6XikAf8+Qvje z&KNtz!wAC;{sCXwq}kpkp{Og+mKEF5A+6z7un#PGtxjVcMZ3j69?Jc}yG3>c8aVaJ zr(5#;z~zLjKq8Q)#@p%p4!2Ff(3PN8cT~XulPYr0|i%+&)u?oT6jM%#>cqO5WUU5&*2DUvB&@5WF-WlrY^D$OFnB z`tRMiH$Ca_LZ$Qx=rY;d9_~tR_8+|0#vU-O{?~IgPIQt5ejJvpDa0Qy#IQPoi>4z>G z>mg;XKvEw8`J27=PxmlLVIe&vQ3#YH^&dU1AF(n^x|uQsK#Yj9VV1Z@P3hmeBsxfN z+>xLP3fa)yAYW*Vz`FuH!Md!0?r6euPW0Y29CARXDo|quORM8+C4jC+^wWj(U_(jc z@>k5Od{Q(IY&LQqRhrZyUd@=2X7GOrdiXzzGQPg=zei#yR(E{c=;PFB&2Nk5PEp!4 zZ(9V93M9>s3UU=k;?GizKRuQg>D*F=76ESIq>VF*rIw|~Oq{`q@c>J&AHIpDRHIhS zXL43F->rFgPUbF`PAQje4b=yBr_jg&_sKJ80GqNEuvY`e0eiw+JIv_*9e{{5ykxiFA zK!kN5^R)1g;{+!C6ec~5YK%p>XD*xlY|mE+R|&VNI#8aFI%u4K>A6Dbx$4>pM#JF@ zgii8j3($?@nrzo(`!C|QwD)q8bV~^T!~CCnDv`vORyufujmZ1$V;OSMxjPOAEN!y9 zGKG#PKif-3P;1tkAO4r7LWq+ 
z!3T2mTZptH_s>`%XYjKMq0nou!?h>nzs{2)%r9#Kr^Bn#0=M?`v7gMWokpV0@-~@- z^Ciy|#=pI>Q#~oU=mYQ64D2yn9!mak1LJuactzwgKNL8?hA)1kURJnX0%`g6W~ixJ z)B$vkCs7=%nu^SVloih%p6j`DL(BIdwXoAZ@RT!bR_%E3_JLW8t;G(Y)F=g>nxXKTqH`0GODp_YaXYe)4bW>=u! z2Z#JVT~h_gu`B>%OStpK8HtLWy(YyqDNg(T6I@{~QQ;!c4xQ)u@^2diI*GUfon5fk z#xJgFW%G7lgc*ES=2p;bXJYD{&sKD#U*mN;FDiVExC5Z*fW&L(E6`-wQrWiN4ah14 zz~KNdseX=GE50_bu1&H3I#Ud*O<6(Q%;-1!Uip*Q!(Mn3E@_Meb|8O*%Z(aK8{hbz zKC;Tn(m+~?g943rd+$RVLP%b}S>etnRT$sB#qXKh+jIJHL2Xm+M)|QG$l_c$@@c9g z4=noj&2akH)1M$Up{TNLP3uVWw2dA^V{r7{U?a~(Xsw*RHaK)GNNhikA9r@ZQKynK$lzCBEIQcCyJ6aYE zaA)EM*5EMg6NlI?U_TtT=D{b`2vU+Uh+De-IZ-kTBjFU8Wzm?}uNk;rT6jy+F*Q+3 zaddGcPv*?FM#9LP@`vQo8HKE23ywirTdw(el%}swLyVC!XGuk}lGxzAssR8k^be_^ z@@q`)Z>XYj4fOjhp&Ow7b^EWO0UBHd2!N6Q>Jp_U_2H3aml)T)P?FN?2|g3n^lP~k zjw@8!-wSb=yE83ggB^i05f!R!JTYo`rsk`9>SJ151{Y87Pv-E$4ia8`HsceF;*pw@ z5K|qa>01=iU)RL%PoceA`uwKoTO4{cSbr_4!l-qb)&@KKLoV3|T?}7OXuNCWC|nNk zExKjd<=%&U<6sv|AZZ%tuJe1ob*6@{P*df8A3z1Z&rVqh5xneMCtcv_n}MVK}pCIhv+b=X>dAuX7Tg=Hd?A^3E0wW{>b zJ+kKQnVmpK1507uMGiVp`9U^KOL7naC%rW7?JPQKd!sj)o>66NJqox&=bS#;`oY!DXI!K@ozrX77vNXOVT)?E_gOBlmvjex`%0!d zvme&3(M4ck(XXvyW#@T<+uZ!ZOdm!_S~;3N=&Gm=b2FVFC$c~>HH>YimT+oLY9vpy z?$90#!W6m8V!j6RGp4);qNe{CAmE>CN)=N7RXZ#7ySD28i(=^4o`iq%BnUI51Tu>O z66jx-O7+)0@V{CppT@5(l>a*I{Ms@ilK-=X@;3t6e?~3;)|U7$MtZH&U&;dwQ%s)# zN0PVq@2Ch2Atp;m9mB$<#l7K-Zx_Q};PJ@sW)-=}==i5&KK@Y(@&6;^fQbR9P4cJTEP3gi{w5U}&S#)o z5tf4va=wSt&nx)by2=!vEVrY*73!Mz;rVMj_tZ;YGgSgZaH>n)af@GtKb8-Pj!gSGME=S;ydmZl)DD73^s zKsIJj{Vg)F%Sg`vg4>sIDC;L3RB)9Mxw=o&*xS2QQ_BP8v>thmGab}~deWf6Ki>9t zsxp5mEpFNIm%<4T@&^n7eU$l z8FnF=!3#pmwwwIEUvAUK^EjMNzJeYx`M2uw^Saf|__hV}YKroOb@8dbCY&SSr#d>x z6pDBxP;NUZzB1;QZ)t1P%SIwn`uO695QHXrj~^9HQjt9ULpqf$PV91m%I9n%v?<># z{sqi@*rCV2j;e!oJ7){$SVf-8t89aY6z+VI>APEocNxI*$>v#BqdulcN5f(A7G}+j zVKu>Dya@;u#F?pkwMJJX(lRaA9bp8JW&83gP=OtE{ZVN>ZFB69`qroSex{{gs1wPS z&UlxZc^5j6p-ywh{c@KWN&l<~;bTYitFcIi~yjL*9c+W}3a0xhc1afYrb)1zAC*g>DXKD$`)2R*n$2a`5DC7g(UifpO4$QAT?o6UhVHjnnOgG 
z#5=)$OjD(W8)x?{+h;TUY8KRa!ah2yl6=cAc88Xh(@pcj!-IoO3zxJ{k{4=ZOhd<8 z)WhUQ;|0i@1Gwm$-~R~W=8Yr52jxBuT^!AtG!Po?Rbj+n5rfi zrkEc#p20cxpd!_?p|cW&H)3UNSlj>?%CG<0@|onuX4=us0}#^om*HA|?m`MJ^(|6M zD|)*QdV$wmtWDD#Xw+!OLd>MCxBHVGjT**j_3_FzQ6&`99b%|iErNy+K~0}t*}GCS zKN`^9)ou7_k>gg`z$qh>;;B+#jr7FN_YYXr@{4O2>Z{?>3@2a46i^Qj@5ncGaitFt zw2qWn?Jnx1k*xs&{6_N*1L^z|D48cf(iRZ195P8v0eB9+b`c5XVTF9DY_+n-Bo|nP zZy1dVDy8M&bzQL3+rtpp9miQ^`CbDT9A@!Egaq-F4kc2kk_CUKlaq6Y4ec=^*I(mZ zw+4qCgF^6>LQ!M1uJX4rzD6$_785^BT%?>~*Y(t$d3_=F?He6>(e5T^!V-K(gImE? zgQ91VatzLJvT;XfF5YOeal4;0*z_6tYz!rLf3 z8)}Q3+y$z3e)y5KJP?&kZo}=Q?yp#Unsqq$V?)A*cbnt=6T-!)_pm|2-U(vq-u||@ zpeIxjucWx6{Fg1JwoX-NX{v|rK9eo$^_F~{=|V?MV2h+HGgc7i;67rt+iE%8aW$u6 ze-PXeAVP;l^TAEk?IpasB$X&!%dRBhwx>8(AckWQ980-ZQO-C+TjvEwtdf?A0>T>- z{aU!f^JxW5p)EVinJV`_7XgmaCl1aI>YJtt;>VR*GL)Q%m;=Z?Y7cR&$)MQp%G+Jq z4*gI~ExmiXlTY)8+N@t?k;zrb_0;f94S-#a`(6*pr2CJrwg)U)Ns8RO?FUxrOeMg<(PHb+l=lqe|kJF|~?wK=}tPrmS;N%c_JqCU zj0Db;b3<>OwfR99yy zIjec=KCiz9R?z9oZ*`9OG~H}Z)JaX4MMqnfh#R$-{?Mq`d?Hiv6^FB>ohD;S$SeK1KVMxu3Avnlf<}Atc;PZEuCq()fTGH?Z^0Gzl?uN<+oLg7}_>| zRmUG;8#HDu9LSg#BKqIdnz->#n)jcql%J;ZKfAp~g8VJWjt{)~O6 z&3j$(Hjie=kXcoY0FFpc^b`3@A?!Af!+i6J+1qfbMyA1@HLaJD1XZ1o!HL}HgF~6K z{N!bm>FHvW)~1^-Y(G`B;=TCNw2o|8T*bEb1=_%>>{C78cq88O(|D`V{jOVG46%<1 zKHMM>lAMUISeOf-5H1QJUiNO@-&a&R05F5b)m~WO5GQ?aE&`DPw`jQXig|fxyt>Cj zb=`PbFUTs9l2JHqKw7wW%8lNRujtu@DmD9S?GtXk z6DMrBk7KOTnx0%Ph33p=QmqwYmM7uc@eRYcv_6ltH<)FQ`2@LB=m8>aSzhn_Om5l2 zA1E?*hHe2l-iI7fG{E&x;y}87VH8Rt%L8~+-KVFiS0Fv0B7{hODr)}n7a(hgj(FWo zoVd?rpv@McCv%rJFE(;TKltd>aGSaJXa~KGMtIKC-p#A8=7x;_1DxGE&vy{GtEnIC z{Yw+745v2=gXLl0KDefTKc!%9-eVP%+uuCFA=x{<3yx}EN%nA5RSJa?CKu}OpOg`P z(x<}fgA&_tBGpu?Emb|`+#Oc)%QBL zLpwP9PN=tnth|IY)ic43L{a8)F-wnGkNOV747_a-uS+w%t!akSs+72L!=y96rIS7i zb1ZOgpRAkpUvLlFdL*8xK`g^I&(-Kqpwu)=0%!0;%Hrz29LVIhCTrm|t1t}C?pO<4 zLt16HlstB%%>BXEA8L&w?q1YYTG4Z0eeSfF6G{TBP*{^SyYvXi56a%As-kc!lzWni zFfO!+Um4GSl%1ls^&fMg3>#ts?1O=LS=*02(&yvkanVWsiDJNAS5NLQdG-{OiI_6P`sQ;8c!A9zVB*gJ8chG6XcX=m<9aB*825Ih$VNX}w&5+X`GQ+vxb 
zcPq5j`Fso-7$35mnD$*LWTovJ_wo}F<_B~(Vm{hEco3^?<5d16Hky{H4rS*fwR`)` z=X>(|qrYsmvgD6SVB&u>oM@~V zkrlJD$=o4KX{nii8l+0u_v&Pk{Xu_Mrl~f(AE#h#@!?EgtdF&6X=Ti;u}fnF17iz| zK;UG?UF9t%*ZvzC4k6yQ@$=AZbFS9SixpRuu7|I9^&`*5%Q$HIr23__Y@RT=`j^?t`#8^a9>+7L z)E*A;S!pet_~vZ_U}m+){N1hOclkIiaq37Yiwf^meOKHpN@}=n+j{>bG6CGxiQcE$R1BbjXGcb2LB>Kg=JnodPFehnF;R-Zf z_-5Eqz|&BsVwCmO>o{JJo6j`(txH}pnTCS2TD7Na(f7&;>!+o$Vy(jT(9T2X5%4Iw z=fu0?F!&nQrM=#Y?majiO7aU(jShMaDhM=OECfHM(4u6e;ao|M5F?9v@BNBP>x?kp zjd=Y+n!8{-Vs1H%mr(DATCTCvY=GOh@vK5vnGRN~E_PMuLb{6>)x);DOmkuXWF+=# zomOKKlg&MI{FB6#^Y6Z2%o_FGo(lb1X_X1V`-*1x?rOlCe+N^vP}EWnU2aOC*x+Ji z=u!m2w{Q*t74=(s6;2l~^}+ZDy1srA+ESSc!ZcTJi(mG*J=O7HYswm^YW-;_XdP>~ zcB@U(G;n+2(6NhcvM+=70nhhn`+km^&}Rb{K`m~kLF{>}P{J;!@~ubo2W+ikPFM<+ zror~Qlg&!!WmU0-$a_sL-3^)z%w_dp1lFLkS}Y=Pq%|Jea)DQ6vo}A=O6@z|Q~oge z^L#{p|FgZ>LJt=B&Hij-cVkaS*0xqQdbPNoVzG8E*ZHvV+A{-+w9U2|2E*Y{PX3&z zRFkm*(UjCAqw91L%weu6U zr}$Bi1*f~y>v%|~GPY0H2^9n$LIckeQ3>&S-)#!4oa1<1@0M7%Pn%-JHw`-i=la$; zYrTJJ&)ZWmd!5;y$J0c`jkB)kGzsUbMNQHU(OPM+d1EConTkW+lwQF0sM?1uVRSfK z0J??U%UDWb*M}9+aU;{q=!oD6x&68J3bhI7_v*A&QzqHke!C88br5ld~5J7s*Y{S2u~4!Ua7 zI0=4qD4oT3`w@@5vYKShE}$c&U=?BRIudFR+}}}ZLecMQgD0IT8e2cuM2t(^gwA5z z3cmE*8uGweEQddhT3j=$FMnV>$w5UE66R%*Yrywl=K-5pq$K%((hub@me7(w#)2oM z|7=C{|EBS;gXjMkB;`LdaBRAH{s>m^L3IkXhx1N7pgLo1{p#*yGqR!V zLAew(N2r_bd(rhMLCcUnZU*a3*mHrvQxg3gjgPpPQVi)H$4l-t0#_h1Jvn`q1ru~P z2Pe=C$tB(TyQRfa!+8ZYNo-sn-iD~AZ0{9sHQctLm}{IEelfSDVt7l4{2e6SmE8xa z1opLIWBc-o)K2<6(b?3~TR?Ul&Q`~YZf40&Pm>2QxOOU-b5?pqqf^90|MhwRL-% zyoxv&Mv>Kq^X%J_z1g`n4Pc><2*2+@*CxRi^UC)(dOTQ(w1V=J^n)3viD>GlHXs9m`VV@Kks;nle>@|I;xLmExc{D5av~DBB_o5#rHM}0 z{4Vx&<$myeh-JI&=w<(<1+2{-hZU(ny*f9e-CPOTVaI^7_t%j&#hyD=EYwKRg`#sS z|#Jl#Xv6u?yomE*_BP(Pc|B6js}?@2gdT<9o=ZdcH4xZ`~{&ZhRqu zG@LR!J}!VQ5Vt@LEEFmp6E!uS7ec!0(01~cB%Hl(s*tmEf^I~N<+HtMEjM}OZP3z= z38?>dwxexI(WKJ1OOC6iFM6U^NKL3d$8w#SdCE$nU!X=R#*Cg^P3e>6o1v%k6D@{R z>1mmyaP6c@pwdsn0i6yIQF#eut8nVrI;g6c$oHL!2*T3s^zMlWW7YsBW=!}MX#I8N z+lCJR9054A5thmq<^$Vuv^h1r0{zGx%k4rDR0oBq3bnYt?4<7OfrhxXSpiA>wK6Mb 
zWGfB3LuV)hdEH5IsG(3=3#Tl3h<`Cz_Id&Lq_d=@RPK`|AKh3uSQXW~AJWYHTC3!~T=K=-S4g5Ql=#TUG;hlCc1Z5qr+n0jZs1lZQx81s zBRyRCoHf6pw;5yl>v7yZ_T^3Yq`ub-uPR&2TK zyrzd@Z$j@Ob0ZLO4yVFART2zuu*0*C$OaA{)#XZz7!Gq=$_fiwd2^(Nu2mslS$23m zHXqh>DVm8ER++Zwo}JeZ8}xFFxm%JO95tsCVjHEyLZn~K1Kk=u!6#^EQ9`oR{PqquEMG_nZGtmQFL`Oc!zaZ2)RQpsHM8;_RPFZf22XIM;Nff zJ69Wy_ZW@}MG$~$O3O2M=>X`hV#I56xPXmruXlbLd9L;qDGO97qu4`LO9HiP0=L?r z4Pq;22`+M{I7q-C|AY|B6n~oS38nROcNwE&K**Rf9DwuPE?sS!2C8yBU;oOFqq&is zR*Mgz?48>^it1)l1BZauurlrsatS!M91Wj?qHmt=D`%e=SJ@`|aHV17z(7@0w>H z#fhL!K~TYo>WOzQbgFU$F7>>1y$Xrrha{+5rx4Mb?gu5+SQ5ALxUl&!luZJ4$9T&^ zxMIPthiA`|?BQw-i(fbougun0bS6fIpgouN;7`WA$Q$7kz4!$==nf5FPq zRCxpxcKee_P-rvE`nq^%dq-csxC1u}eM9%bpZ{vQznX8AZ>Cz|)Q`>S>xdPl&xaXo z0aZ;q*E$kye0FJ)vjrWCLfTtVPk0}Thl~8c3HOY1C0>JIuR6e4N1r20G$SO3{2+-= zu)AfI+;g{NO;_0`3ngO$lAP97A-~unCOG$fUOCk@jK;~ugnv{%kFbM2hwwA%K6_mC z=7icQJ7R0X9^H9UnoCNU~73p`Ad%cvOAEiGres4i@Up7p2fx zF~DtO5j201J$Q`Wg1@+YS71J?KGj0q2>Qzk8O3c-F=QiPEhOL6fIQ zahEV3>xcUjVZ+1AQN1DNY@YpS$i_lvfD^LQ%KRE^avO2;J1IX>)9V5<5W*B(Z}Cdm zxHQ}~m`d5Q!=<5dyZ+(k+>iRZF+^`=@^6bMA5lk6hdX|91-dtJ&X3ppfuaA3)sxOS*;ubYw zRTd_dCSN8lUAQdFi)Fc59?eypsnu>yCYFa~$ay*}&;>{>UoL$owAZDapVDw_pw{eE zdzLWW?FkZSNWxl@&~tp1mNZUdY_6P6TnB9OZ7+@){z>zgcb`J{=-fYnz`YX!3vwUt zAoN~7l}ZhmN#hc1F#A!Nlwc)uPGir{^K8OtSyVu`n5h5$qbW(M7Rub?Sqz#Zus+&F;eYDf0&OsfEg zdMj>mBS$gw6-h&4f4Wc3W_!AHtYvpw^{=jl;opJ||3mUmuJb(puYcS!x{h;uSAsQ$ zNv+8hXhF5kQb5YU*2eDZ3T{;haDp`=w?wUvregaqN!-`D*c&f~LhB zsWKqvR&IHz`_s^9M`2QgcD3WO^JFW0T8e!7(+iyE?sw(%G60uoAhSiBuX#tbh8x*P zkC~`;|IV4ef{(7&BGlJT&h_D{FN<`^-9D?wQ>8%FY)_%EHQjV^`@vS_R5r6GDfKjo zE(d)yoLGLsbQ0b(40V2Y05vk;BQDnbS)1c5_P*??f8p9_8RcmFs#!PDFBOUeGBlZC z(z|B)IZHXpK_L+hMuswIuM(mPrgVPvA*NbRz32yH+WTJ`M*UJ^V)EcE+7Z@xt_xxF zX+7!4?tFdBtVF#0roPGjTZRemb*zX33wdOII6QT6;jf>{4qt>2{6NIYSTfgUV>#Fr z&W6-UBGjUL_Sf2~g?>azM@hS}luyto8P3rg9`fuss77%*G8U+B);!@yFsaOOmwFB@ z<<3K?ET4<#%ym(blaunAh#cdl1*23ph%wGnQ?y-Ng5ermV#nFF5!O0k zmA4_kr77N@$j20!G1HQh^c96!IHW#W$RDZXo@$})qU4iNCReqHzrWXZt^)Vuwqb6b 
z*3$mi54nq7D7&v4m$TtUDVf)8TpOVp9+%Gb6dPn*=TZl2ReoI_0|@d3Xf~2*F09rT>tk||^ze%?n^jC# z+)qZ(E{=B{r<~t<{O~`OGe%-<#UCrhP+~8!QdQH4$+sC2rRIcg zZ!$Bq3e%LpAH|P52TBmEFYDD&x&@A>)K+%29ZK3yL>WG-?9zGCmI>sa=F@EVq`i8y zT{0!NTrYGW0l8srh;Uupu~?u~tK!Uouj*)@^V;PSb3ZcGj8t%hIN{;$_Glq)FJ4N7qT5QNK<0oDu` ztA!YoSj*dQHLB)m-agSy7I*C7%ztRbK9}8gtNeq`afRi&Ylk-yEeB!WiU)PIHG*Tu zapD!fNpQ<5$ZV%z9r!57C!A;oryu6xFqgt>LH`b3(Hd2K(mI+buCg2Y(ZIYpGOgT{ zd>)KFhtRO`JsKbO{F*WM;PVoBzg_Bh4L@Vv=2)~wRgaUobdr~r`qx5AF1b%DN$ z*~D4(pP880DhOe&FA8o2=@uEzz zmI(bhwkBA!%!kviB6 zPv1N25o}hYo5EOw)7IHY46|~xvQgr3(=v7_jfL5H_UWah)U%~Knp^##FxYpNt;9ULtZ#13iTFiDT3J$FhpSs>jqyG!&3s)zEjJW6X2{-ofNXb_d*Qd_kCoB zRn^MxVwy6EL|22bwC(5WEPc~e^NkfJbx+rggYu=Lv7}F(%BnvhS%&K87A5M#?TI6- z!Fx$q#}Hq4D|=nc{X-Y$mhcv?_l}&0rY8(3C?Lf_Mq|m+U9&Gg#_uyOO?#l6bFN^; z5_o#ptK#BEjE$8u>TjcP+T$taYK`wQmcbkE!86pQ2gcRA_ru}vs# z=t|~zu#D%i^`5hA*;14nUarM5p%~ENSo?)WDJgzT>4ZM7;bhV(I|J35E|V@wiw_Sj zlQs`FpQ#(vUW(a;?-B<NndD+M0DH2dq23#&dl@IqKF& z)RvTwvg%ImV#Je@8QF&GGUp+LSI4hF>RRAAk<5~BY+kmMA&nPHby6})vA>pLcvx3^ zYM)0k*u~v6 zK{mfQb??VO&Uffk1W}V^zS(^<8xxbRBJQTPZ}bE_*5h~$3%K}l#xfnVHfL$8E4)`- ztygDCh@bUqQ4tPT810*UF%@*u3=7D&D#JwQz{bnDH%Quv4sT=TsD5_Ntb5qv7M)aE z{p1DBM(3Sv2(!=7`y-$vgqG}l9~WiY$gZG*B~6+O2R7MPo88!ctfhf6Jo&m%{zNJp zmLMv#0lH5)Ib2dgAmThh&TQD7E0Dq2bDiCG`-7c!XVVd?3CRo>IXRzJlEFzt;N&W5 z(!rIrHs#%pzB===w*(=$J3(aD3$972t70WU1WLVH- z5%s*ya&&*D_nQK?IXVBhEX0s0}8tUH5P(tZDDhX=TBPjQmW<&o!_P#nU zs1;9{=Dsvp4sS+55g%T#6si-k=;Ed}VCpOr{+2%1d&60wS=TRVBto+UW3&TMt9CL1#Mbd|EA0g;@rm+sG*(|{@ zf{3A5jD1JO9{uG4m3a;IP7UeZ^e1X*M8oj4@4C9Kbd)Y0ilXj)a^V*6p^%(j(ttOVG=}#Xs%eqU}Rb9 zNpATHC8*iwh?Dt-$?3$D`Je-Mzj)u%{j*QBS9+bcy%D=O6x~?QbVNapVTcMI!I+!p zgj+$5a!y!c*BLkYkjQyw`eYCn@q;tILwr z9BgHD9nuQ;{bwwe>KYzdTp$nhtrhg49&Np^8V`f?_8BPY9iOmCQSj5>kAg(Vw154gDBh6eFgTq2YJq^NGC^-;*^DoTa^El|WV6lP@kHN~(It3Hp@{@N*dW4+Hkvn%7sg zv*_E;C+aiZO(us%@oezh^TyJB@58ul7)^bL>W^4jP{Uj-*?077Jl`GC&9Zvr>E&u? 
z7YJwhg<(p(w?!Ji|GW(uziD#e<%e$G%&AAe6gy_~YZIo{v(HN5K|k)TVR)t4&PSoo zlzo9|F{{qIIr>tIjeAbC)baF89t#f9*`9PFiHe5zf{ zQ@P^?3W8u7Izoe~7oC zhraPbG?(<_riY4k6)?u0U93?)mqS8o8QSVt?+*{vI|=MbdoCT#?f@vjiWk3!#`yoy z*v-YFPPr~lMuRrT9zE07iVic~7DkS`3DxjSQP^iiU!I7FSjvA>c}VA;e9%hdYG?0= zgJip*exk>PwZ~%B7p?of&3k2!VOu{OiQ;4D_c{eLeTR=saoWe&*}W`KPBE@IY&Xk&|u*n1}OUO9k>A&T4uUP`U^P)d%E80 z>m{s(zvo(E!$B#>Fd&n%y>jsio)#1qlzCM$d%51m*j#sC=pSkidH2@P%EIR=;l3R> zf&@~`iKo(JZJIOQ!{OJ2EX$ z;TZ!fJcg;pjt5`7*SS1Aal-9&%+nhCQIova;tFkINoM9F;}Xo1FJ4&^`rplLLgFjx z`s?RnSRGPdkWZ-Jtb^~dMYC96&GSzXYfEl(w4Uo84<0XIlF^IzzG}ls6le#^VZGNN zb|vaSAF+3o@IOXAeu{XAXkK@YFO{)_hh@<b+Y+TESN&Rpqj&BAELauy!E?)Awwya`m?Yy4 zQ@HJ(cQ>(O51$WW&h%B;@DGX_EoIzduDg&AiO>+e`ykL(z9P4HL;QMvOEa?NT5o0< zqe+teC+Qt2HBR-m@dIX$EQ) z>f=gzrL`);dO;xuu5YB~gy7T^Rra^E4o0ZuUoR*MtLME#>nS9*OOU#$5ZRu@lUJp( zROIBNv0H{G@c;ut<9_U$;4!p!cqS@Z>vsCtfpuuxf%;U@$+M{^+%f~zdAoNl)dKSF zj5Ozju*aOpiAT*Kkp`wTJg#gxl3YX=Hh4E-*2ZnhlJcdU@wD2dfaM@{!F6cJeU>L< zkWD`e%dm2FVrTs`WCp%M(Cs4u@H1@nJWeJG9me6KiiKRS(sI-dvhX#oh|o?fMgX)U zV8EZsMaFhzqIl(JkQxt03UzX;Iy5EdNAa|45%WqDJZLi|`1w zY=0}ymMPt$i}&z6cGgc-KoLk(%a3TTjvQqTCoZzGDfy zxP!>VYKL}<5!thc-AAHzTXpr`o@;+lIpk|3I=P5Hm(5nxu_(~P+I9YEQJ?P3(;jek zvF>g72d^&*$!LYVz%oS0o#aTbyUg;)FXL&za!&DLCYyYkLXWym+4&c@?YS3hdf<^5 z9xkhU!9yCYjCb5=B2MrJb(1rw97_WPm~jNj2GHb*##-DnY+0_I$}W{ip2vIYdXo@r>^H?S~G~zNJ}*_-&Tj8kP^E16JCt z$cm~}wDs>d;GZs@OAuc(7+ni;7Dwx0I}5SH+JN&x{U$!PeG+lJEXfnAtOx3~1IOE( zElVkta^3=6&xf>uWW|#?&i% z&*#Nh2Y`vrB=XB8{5dXR{HKY|(68;k^_)S1I7``8du*tk*Cgt~J>vMN5vJ^M@w+kQ zNf5uc$;Kz$#=oJ7KZeQUrY5g~;R+i=MSdN!l_!?5xyy@f-^O!wfIYJ&0u0_ABHi%=)V0C#5NG`_g$!6Z$F8ui#5l>rkFy z^<<|o4&oKi(Z+piDy|9d#x^1FzZwMos@eKCi_HDaB6avxAo_FZu7W7*+%t#9Z3^lX zDRNJW^x#jHr*KE+?^=;*P4xC;9xg6W@uOPezG>n9LI15P*I0)Aea^0leWLrOL|^Q% zc&n6KZ=1ywbVKv`RUMk3mx%)7=3%54*1AO1Umh$tPerff8@L@= z>sqwFw?%~CrcNI$2yU6Oejd1^6Uq6e@4-TRMWL8?sg((-b@(wKk1$D#kcPt#4LU95 zotIjjrz9>>FiZ)F3gDkOp_k!b z-A0VatOt$`=TVTaLq(=tYUGXpCQ?k~{%XYQZl(V=py@S4}AJG9@h z#R#zp4=!}?jF$+flt-tg2%XS)jhM@OBi9tRBG{JhZ&)VuKb}c!H 
zoff?DEHJq6#XziDu441h`N9}BEy(2=Qf=Ao@dFEi6$ zP1`e0O_UCW#IVk_y_xAX+YKnbcDOLtX++GQ9hPgb%uyaG+x4Jd;k>a~EcFz`hiduD- zpf2c?-}|7_lz+E%cpo2K>lhEqgIj0nNt=khSdF*&+>TY$SL~dT3!h*X^g4X-I6mt; z?1dRvwtscdwj4rq{hM* z??Ub^#U(=cH*LJF+0SP_hDV?QE80V8Mw|;-N2vhn@x?DsJtX_ps`Y=>SStl zT~AP3lZFZU=nU}c8`NXh<^UFXJ9?mebrKMHoMpt@Sr8}q5syLl%L#3>cM5+o6m~HQ z{Sy9bCJmjSR(75`A#2Zv>)!joGVo6ly%06;Wfci{* z_0!^EiLXhaTt$V~5IgDpCZJo6Pyv1OzZ-DC9VB3FBh#^AZBj{%J9Z&HQ$a){LsTu^ zrBKB?PZY;Ilgz+-+j}+EHqz~Nxfjxz3B&X~a&mW3O`A7DoJwh3W z%WWYo3LRnpGQhM=;WaxcbT27BWMf(=Hg3&m`0}|vKNg<_Nxd{!|9p8zPaStw4j0RF z0(-1&&5)K^f5Q$AZ^emACe*#WRH_6~KKt)4579VVjluSp56gsLi)+x690M@4TOJ2$ zx*SDh*LFGmYUqu;fQQdI(mZN)ReW*_3)bARWVD``?Z*%^olWZyfqx~|EdstgW!RLO z6_BogRjS4Iyn>HTu0^cU0d?{F|2Iq*aRA($2GV~iZw!P_rQmh2u0l^V24N186yG2Q z6y72l6-Tj{OxSGd2lyDvhPG)hrS=w16JI^MW^){5>t;`D58E}MSp`T?1RyBemzVIz z-E<^?KHALd5HrGf1UA2jsYeF@9`YX-%|%osfM{!6QUhj9ZNY#Ul>~cFGSyv)buKSN zfRyS@7Y$fZwuUW`7Oq3(hFX52Fy#dF15EK79^`E z?0c;EeNa91k4wV;_<{C3q)v(fbDm4n;EBMsyUSnB$8Nf$_@*D2Ef&A=Sml58Qw`z% z{0_3eb_b$K)!%pmHNVY)ieGQw+w&UnH|DkC)z78?;HiCWVddt2{$BsnGwauS{fqSq z-}%tgEWI1Ii!<}jLSS_&b6QYfXs?M8pZ=7w8-vR=8=;z~f&uoZeI@i&+H(rih{;97 z-E5nO$64R#+Xx+_54nGQ@kp_MxeI71GzL5N;{Yzn4I_meIzpN&kK_F%)QA%_;SE)6 z0tLDh<+n9SO8>_zb!~2=^q*-6{C_GNBY`_K7Yfb&zFxGuq$8O@{~jqd=to$>L(WWy zmr+a@n}F4J?F#tnb_4Cj9>fAj%!~%*es0Dt_QFRmA$EdVKV)f$4j|JS4c)Y7d|J>A z$w^=U)jH&}D4qajcCX{hZ>0(->UgU`QfeD_zr{aa5Pkyo+1(A`C;Xet0KrezUpA<` zg${LyMPal@*m@@2w<9--C2@^Ebzk8%nZ@1#Pj5!}`oI4SL%5;b(5vAt`$@m&MKZGJ z#o_(Sw$%CdPumO}{yX0{6$;539mw)Yp6DpZMi9t0vfdqZR=qzQ)E9@IX0{4!r1H?l#WuHSC4R7W3t|BV+O2aRhAFDp2Ru`tD>NWi$_xkaVQk2QlGJ zdXLSN$i?g1-)bkv?t5fwG|WjJecJl@3v=b5l@T9;(OsgJSk`>d_M%?7>ALqJbCPJC zBfFbUwWQKI6FLXaMz-&6 z`>^!lQ1#RJmrg!Hnz&mL&g_-McqPVnCQ8OKGe{Pfoe*Ifml?mpXtzja?!=1fP}CG2 zd^cFEvooihilN`4g$~1}!hvfWtQ)pp+OewZ;=Mh)*Y=qr2%yo_%z8Qe6*rV*g8TW< zT=vH@@8xmF!|CC-#c!na>SLMmo)on>PKMq)og3=HuvGi<`QrE)RefB0jhgsUT#;F1 zTZx4o`-mSYx%IST*OkMeUUyc->-{RnI5qr>lQi|IALe(M?!4q>DN-laEXz-=+VzUO 
zV2(>KzL;d!rOxgZ!>Z;6qur0s-lS%-tt1uD4r59=-5i~3@`%QX3?bdb!)~DCBbys+ ztq@L+qdFnC4Wv-;Jk$$%7L@&FC~ZYqoMFa}()XEziyU(T*g#FFH?Cy&>%qa!%V z&WRE$o24KkC><+J?%xum6Dl)l zE|faEQBv&gpQ!&|t33ICqPKVAq{w#4r2pym{I>+dd0-#xk``r>fk5eEQ@v3kD7p?Ep&P}PtFC%4+xI_rydZ>lg)%;f zB|d&Z z1}z^uC;gPJdo#xlyB#zu6C>anU(DuT0YwNoP1bIJz;=5?H9Nn0CIR8wuHoE}6UXd5 zo;CEFt+pv!7reZik#DV{x)kY4=M?rzb+7J$gv%nrv1&slR{a?9Dz&ia>X7O@rMf_g zx9@5NN>MnuB`Bmq#UP~I2Py94IT5VMQ9C{JVA*@m<5|~B zdiS6f{KX`O5Y?UeEjbOKCJE7SQ|=|6oq0H>jV{e%SG#J=iRGjQ4D;Ht^qQ0W^y3wE zywvjKmsHN!5clski%rX7O;HbPY-zl?s9vf-?EBE<5S>Aol_E4VPh&H#NrGk~FIb0; zl@3e_)RNS-hzS;QE)I{5iCWkn%eHth5!lz+Uklr=o`Bm*JwDow-kWybc*atCEbF>J zlDf?S%4O^!6wZalm?XJSAZ?*;eWzuOh#IV<$~;tHrDcc>nQLduP_ zMGtc97fn}FACEM;`?z_^e{As0rKnb{Zb#a1yI;p0v_MwZrLw{s&K#Kudn0mhbTQKv z3Odu3>Lczht$9wy1rV&j=>hc=wXYd1-D=!bm0>u_Ux_Et6rTfej@oqerrG*blk-<6 z2asjn53i_>vUDUP4C^EvWx74RU2jtB$ZYH#lkh=orQ+E=1i>QmT-nPP_xl$P)3k(_ zJCI%o*3wQhRI)B_c07>Q9o^w)n!^>7^}tiRe(_-g%)rL|#dL;1dM711kkZ9dkB@GZ zNBRQtyuE);vY<+GpG-@%;9b)9Zf?8aicq>1Q(IhniMT~-7lYUCBM1Z?n6GFUGqvh^jo>ExB4<o6lB~W(AR!8i8FQqiy@U+2j z$(1a1TX!|}E{PF|DmJ6SRn{m$n{a-=42|)QjC|#@9+VCADcPx0svEXX zryd2rs;A}Jo$c>1=z^oWYT|}5tX+?-CvN54@EK2R#!`&yU!S7r(z#|jW~x48D%THZ z6(r;?Kxq{ub{9+DSP~1m;D63&LP!Wf@Tv(dc{P$=&;I@Wf|Cbc1qm|jI&5+i*^SlB zlsn~R8O?RBu*Fe8*V50rShuM7Mz3n)_Mr#Dp-D?6;BKEu-Yljto|(Rh-9H}GRoCR> z)L0 z!e}GKee{?m8BXl86CA=L&Qq61(%mzPlT5I6ZXaaUz1*?CJ7`4GzmJ1*9pX)4dRX$@ zl22_a>ogL-GDewBLAFlpAr>oREW_`K9xGLy*{U*Z{)DW8&? z$l`-b8OLqZ;%EGNEGOBSDued=@ZyCGmH1(SoN~Q@AhFeAF+$|$)jHQFiofisl>3m7 z1&=jRh%ifgo@V=CkzJI*P~TXvb}!WPM38y31vk^Gv)sGnP!_k?QiCcB!IVGu+th@& z_=!;kr&NfXVmvtp5qrw-@Q?)^o-;bAeXJKYom~zPOM=SGXloM^se{;x{L)))y(XO! 
zeB-{CW9?rg$^$Rz*;c)he|kqACN+q^xe4f|zd>Bx5W&9*0gT`vuA4#`qv!z-yC;fDE@#Kc^BBK z`?+&B@TkB4A~(7Xe(P^jvOh&`&ozZ+2HlpKBLy6OT_i;Q_h?Dfo}}3(sI^Ur<}IRw zD~PR)F5pkPfUHtcQe%3p%4+J~d-_Z2LSh%=X!pC}x6lVvq-#$Upb|2)wd71y1srP_ zj@kEHqgkU#jW>cj9~rGbB_k>O>I#^Woa@k<4dBq`4ds2%-Uc55wbemw*fo{F_R;^{ z09&*SM63UUCjH0BzdwncG@1xRV!sa?`{>)zm{iTI?!Ug#X%=J^)~|rF)CV)AvBlX~ zceU6r2hvQXF-UvFy@l5D;j?h0b1Dcvec7@H9v8Hj1e!x{RY-4T9FT3>zxZqEK3&(h zY<;j&LJb(_?0K^3#piA->Gtc;y@l@6XqrgCFf9XH{#R9(7s7DI@kR|?6Vuj|) zZ^MQDJE8%5EIn_A)7PkYYWHeTW!c$tCUPf@`?@Kv+7J>{z!RAb*d5$Tx~BoHoi0QT%AL-SoY_1>0_~ex2Y+H*<8%jUu4g*{@=kvm)%;KB%R_NyBEvsN z%5F&8ZFG9zx4Hi`b~AppvVy4I_5V?}nhlkj6f$kuCR$k1=(r1S})4@G`cxgYFm0yqTvp74{9dD8#tvxCkwIuV!QSHS9$+S}3eT z&j7qf4Iem)q`=_wz|gx8$T*TD=IlrSkZjtQ@Yw+nO7Wk4mcCiJJ9)(ol;io!{COW5 zOKx$S%f8B-i;0VbmcJEj*t7VW9Y5*dciS63$<~n2*DsZ2#u5K!^g2gSbTArTfmE+e zJ^Ehvh;o%{A?4)D#mu64m~Z(UNptz+!`PSdJ7Ur7*??wky@o{xU|9oJ)U^&^r(O3EfZ>c3&To^ zJvxt+&NBvG)2M=Kd1)u}vhQ)jRw;8EP-{LL-HaA`z6nt=o%L2b+rmxoNto3TBF z*<4&su(HF1#1L*0p|J0^*l8L)Rcj|Z3>({DhL$p{eH#51c>DG_3OYHcN62d`)$Q9( zr(Rubyn~mIMs?RP zAxUX}Izwv4Jj%M#^)E`%a!Tmz{AguX|$9E|EBsA?W5kowp8o%;|L=jl8Ai z@>EYmpPtibs1Tni$u!sM(VM7eC-awM*b6)+cBb4(2o8?B0cZJOFjK|O*xn$YSN6z_ z*XURR+hYcT^a)d28sfg7RACy#%Tc3sNCULrBd8(TgY--v5)}v$SJM;jWzp6nEO2vSaSg+4!ZmBU*!|H?mKTj)H87+x z%n$EW&Kcfg3y+lz?d+IAv!mRcCs&=5kM?dyvX>BXN2-5N2zh+&T0k5FHY_4VZyA+y z|Drng2;MvcfyhCJZFVSP!?Ia_-y&_VL-)`!H_XgfE-$q=Q=KoNnHKJ`qfLkJ;>$yF zc=sM<@nU3|^5jiiSc~naKzO62gehjj7CpU>MyR)3TN9jQY`vT4dPifS7Gro{hI~z> zgTu(O{w$j~TXg!F^wewJ{RD4U9Jrs@ws+Ul7qYSKdnGHWCD&d=vgbv3@4iwx>S(Pa zMK@ZbapoywKy1S1zFAin-;t(O=LfW%f^%KcrKPPzeh+szxJa<=-OW2;&RbJvrn(5* zIoT&eO@qV6|XHp625reH(soKUvW!pVjKnG8LF#e=g2w}x8MxcZ$b8AN1rsMx{ZL)oixX zUZP^1GGN)^(YfD})y61z1lZL&hJo9Y)bPu?zbzYt9jgWNYac_pVW1`%$4V< zFhO!gIIJBl{xrNx@p1D5YyZ05w%ukd6{8Hg3uJJQ86J{7hSYfWd2xp7SFxh?RqsM? 
zGi~>(yqrCLMzVCf1KX2rFx-`3hZ|1=q(bVi1Qj-^!=^)Gn;#{4OEB8%_$34pw~JG< zI_=Dwxv0rHRAtfodg#)XB7Yr%_zI=reoW)(sTTjdu%S$G={769V+&ob0^B&dX8RdO zX=g>Il$?|7X!1xE&VR`V{>y!(KKQ5Wc}dQ1)Q z1va8Vze#q_BpKc1L^M3}jJBcBcRO0*$JJC}dZTW_itqb+d^L2HbwwYzlWaYFDqbG?r|f zR31MH5bp0cmya>ot#YgY7L&mWIl?`bT{{Q`jXIVR2a%J~F{dcnb~$a;)8 zq!q1o4HsFJ^}%rEz;JLmN*?fK@fe+=b|vLyVDV zVoYHafIzB{5Ad=Lbw}qS7yyFYK|bffj;G}doZ2!SuOdm%zG}J{r|OLkk|^VEGm-TA z2csqIM?T^5{A}*do=E&{AsI@kKf_Skd`=va!!E$%pmw4C6zu%ojOj0q-{k+>>sH9E z`XIVEFfqAhpi63xp1jtJz9WhEf-G83?m*xMyeTP{Y3HFmEaoIUr*6K|^Jw<%cu?bF z27iP%Po`F@y0&)h(SxHpZSu}fap8V^WhT!Qnz|RVJ^0V(^OHtAE=M~*x_*9^%$#dc z5F3VH#j)1_MYa^fz2>@Gk)DglwIEO&pFWvnQp@hjpr6V4+%@)?groArUe9sIgK7pY za4f206xy=0nVzymmt6~{5Hb~TCY9{zAz-anO8Yul$D7r@JDc95!oIrf_TWyF>0nha zGS|J3dXWF{g;J8Le3Z7g3CD={rK85LGLDLDtG+mFc3(FIp*EtI;$+N5dm4G}#0rvU zJR#~s?J*S|=#j|r9bBLr@-hGV=#&XAJ_f;p^HB*(9aqzy!!<4#u~jWmJ7;EVa@(+z#|0lfcz9Z| zDmH3lc`9-^^}z(&0q6bg>OC%odytA#_*XD5M7#*_=y;e!g#TZR~L+J+5ot z!nh`}yG62^E&iooX?ym1rVP`4$H`k#ej<>d17-Y_)9H>y580 zoYJx!``5%?nf38C(1pEeQd}gY&0O{p9OS-2{SvUOVC=zFA&>gn(K z{0CME#Yu@_X>D3A3!V7R@Md8)rR|HBihPZ!NR+}_lp`o{{d%mtVHL8>RdpNF-fQJ_ zN+Wz}Nw>J5(6M2ckWkCxKmkj5>e3XK@{{(X^rboXZn&L(PJ4QRK4&S8F!W;#af_ea zX4U+W&dcr5-<9F=qqS`!Hx#WtE}N)zZwq0SlA0k7wEIFfYLaP=Y`kp*PK6kTB_TR4 z!2{!nai7EH_{ZUkvutCXtM!{Upt)>H$FuDV9AsMK`921hJ$3p{oRDLFXxEU&a~nWI zx>d0~IRF}B0?<&FPjxxbq7$NOIWju4&HJqI>kPK1S8iS4s|aefZ|6X>U>vXr%f7W$ zFnF&B82}q^u7s@R!17bsC#Y44s&~g_gEliUM2wlS{vMcoRF0kbqixt3Ezzr4M7v`U3rlwq+%q84yv*t z5m?a!CMc{N24@WS<-(_!kYjBeI7@XhqN7_IMVXyEwpm4g@q41sMp_4HZqxH>{@*Q_ zp9&IjE1r^qDB8mVO_K#6Z3U9jFuhwNa`!`{~VntLvP)S8c> zLn(2EZV*A(H@Q6UpuqdxaDG67lvyQA6pN6Jqq%Kjt6pu>OWrCtX)&T+xXsWS&K5A{ zM0C0R&69y*40&hW^z}Y{I|6CaI^NY#+LT6LEH};qxliWXNtA*1Md`AW1EH8<Ag9=RP*e;5Trob4=Kt}LT}ggh2mU@iA@@Zw!`Q^5nQXDMXZ z^e+c|D~jC@;KL+m&n$db)Z!;ZEF^kZ^iwh+=KihldB`@ao^07{&Xsz=mcp=Mi)zug zAyYblN(6NPdTh3_7-Bs9T%d&il`wl z4j{x6=Uu97slAG$zYZ-&laQP@p}1@AGGMa9v-E-(MD@G{Rp>v4MkG5w&FPd+RBV!{$=xPdU6|!;HqWr 
z;ChDUlE=k@{?c4IwO8$f4O>faXJ>4}GA0AjCEVm${3xz~VK`h0%I=4Gdd%0gEvDue zRi3^%gwahnj-82{S(qGi=7emyRH4g+;?K}*=8w?VK-#Y{n8;e$=v3Sh6zZ&W%aoNI z-gv&vqlfKtZ!4${V4knS>{rXm?#+&(y-{))yR%@{5r4A0 zb0_rHUVIlk^ApH6QRp+#GBDdl+|(`5QU4D@d>|olZ`pX9W!wU{PxJ2&`=<(foaAgJ zDSe8RA<*LByEF2tkQCmQ4ZAz-3LcNBz=~Y??1bGPEpy{!nwb$L6*ri%itfHThsVcfu%pZ3<=RQ+y@!!H*@J6sUX@3;BZ=KnD_w_< zsu~WE+ zYFjCt-{h!dmvYHt&*}HKyUk@4BpKF%Wt?d41c~LmNWYUnkbWFmAREN;&Bm0b+1AS! zKWSl#>^&33Ab;G{n`D9rvd<5&yIFL^E$`K#+qswBh5|w_^yPOV8d~AYv#Jeusx0LcD*g%AJXn3H1ef&_u0N31q!mH?c*!Wpep3QnZ5cseQ8ss{P#}; z<}Z4O-#X7fj^{6f<&;fchiE}lkgdo?Hn){~gHFh>8N=hX*GQI1H2Qftrc_fRWqN?k!XKOCWc56s&0soiw+tN(W(I& z$tPuJo{`SMj#&Ex)--3w)G^ub7sqh$U7Trs5NsFQ2VQjp9VR@QaC~urwyac07%bR_ zaF8MPGU_3s19+#b-Jfnke^N@Tk)La+mSkyCi;CIT)U^lFRK%!ulgNt-dqYSLT1VlK zbNW2&?9?*c_e-4ANRZmjJs86~Ic0?{CRazA9?t%-e*iL$P zEuo=4Po2uwZMheH)XZDdnMXKOw3nQUo%kW^7(!%qw%63H{YLIeeh52qTQki2rKKrU zho<&X^==t`GurBsY|^IuVM|_Cr;IU@Z7UwjRrcF@v_b9X1UqL_%1b$D5rpu&@iBN`xhC5=?UlxTL`eR%)Nwz{9{-Ft{`9yunVj~#aMT^RJ0Nk9jJf@p zNc$Hn509Ip=RuKtx#gCUsc`y>BC1G8)h7x9tJ5FX&`f&^-hBgI#h-1{Ub$EX5>Mlb z5q%*3yb_s^d&+O=(RSG+x1IgixiWWbWiw)o;>J4UGL9w18PJ~1al8ch`Jm`RIVXS$ zxe*oBNL|eKF6>4)yQ%{u)Lj{eFAzQk#70XH->pNQ3`iCo(A1JxnB=y4ayR2B!A=2^ zgM%)-+Bmi~+Uc-0dIyr6?P(-)1m}cqZHC`zkSFEmBAg)q*ESPa@AA(luF@ z8*$2z2L9#V8LPe%4YGauDKR9x!3x->4?>8)d*Gh_xeROdzY|C z3_Q#(IVy27hNqoD4_)tdf5h;fTwSbE*ISZjD$BQo4`>-WYfh(>wa5vk5ABl@^1L#Q zAW>T~g3CT>ei+>RqUWG1|6!6=RIG&vr54$&Sk!>+tvls7v2>p=gIVWv98e-FTwq%XpkCk+I?-~mWhAHc zjwD{fZW4tt@dsqa-q4^R%j{r7WyhShMjgj7vW9oH8aq33YL-jath0LmWgUL1Vg8Q7 z`6w$~+MLPt3$Ta0QM-g2bf$_%-t9OATN1*VL8;o{?N#fl!Q1=LOzeRI>1AeGC~YeW z8H~B>3~+t9WLWu%Jnq;QyaeWdjrq}ap*(^FF#0lwZBYL|J1YcN7vXA7oDLX0oP~d> z*q@rG+deK;Zt28wAc+;zv^CHXcVqwnMK9N(M-kZ84VdVYO+Dyg0O>~H4>p0%N<_=BBxm<>L#VA7?VA$^ zR8v&%qQUwIbPfYalGm|kTzEb_I1>6s#PHP?_~bMPX18jK)jXp+5EkSBRn|5`j9o2@+S<##8_hDB#L54MzWL1R z{gK&{I0uq;EsGL02MNz?4Y9!;nUa%kgXhmMdc2s&3TrkslVgzw-L{{qN1j^dLDWg1 zWWAx)Cm*-O-K&D>>4ClcsFLNl*fB^h*dM8#41yp~AD@D+|VE0DG12^gKt2;8VEW2M~;IBw2uaCYi<1B8{^L2;05R 
za^oO*?!Uto!nBpwzINKH(coROH+dzVp226yq5pUUGau?WC=j&PxV@a%Hsd^!Cnf5} zF-C>8dYgfk*=eC_qmDQHF#%b&9IAea^X%`H?iycVGQ^O~<0Hw&h4-$+8l|pUtG$yp zd{WVE>3?T*i{JZeIF6Jd!lx+=ws3(C_F$~@2k#GGpTl9msZZrkvTCyJ#zgP zXOL;q9V(0-ZsS%^N%6Jh+v5^|tBp+3`g*mCt*p{7jro-vez!Llb!{OIze$2vc?&$y zkLKUTCmYEN_#ERTY!Sr$>k2;EauGJkPIlx2k$^J53Dp7#N@Lr~bpEu}A4Wz{Vh_y;eHv5NKA{JnJaGI3&O!!A2(S!5?d(Gg#SUn` z0Zj3O>~L&p0#J>jT5%-2V2%nTA{xa&jXE0O$Iruw4+gx5RNOsawbP{mV@b9U?bs8C zc!^9#ycOfXP=eq%^%l%1N{nYIuVr#UA6>=Pqu+9J>yTdAjdxK^D7ou zQ70yB$pbJ?wmZK5YzL&j6@tTNZbrx2(?xC$De?C%3n`x_b$$yb`>}OSYsTf03#1@< zbDX5)l%ATTKX`**y^nv#BPjG!uR#%erM*tksz1?vw!rqH7ftGwhDL9&{s`qNVe`GC zSTr>>rI*EqC6D!hkCrxIRsh1=GFyXQ-mn07xCp4R0ey2jWguX62H1zrM6lUhGysBy zrV%sGfaJ>n%M-Raso)MO|^HyIBg8$vn2(9&Cn7WS%U zR2exRom2@A>=M6mS&_1x=V*2?R0HRc25;C?1LTs^3Sd_dz)}!OE;DX~58JlDKhRTP zeaTu86$2gc(HaogDT8egh?Oc$Mr`XP%{hzZZw+BP2{tMYd`rZ!?0Ux3>_W+B?ld8F zm6+xGguvzn`jzjS>bh;F82!}m|G-Mn^n%BEj&QyZ<&#yW{=V3EB{Bf*RaAqqFQ3b- z+q4l!5;Zv!I%pmyplrI11}d-T8*CLHz$5uPybDuein9Twhk-Y`tGVVe%&z3Ius+RQSfdQ ze4aq`F_xvoy9i%g6I_SpU&yTuH!az%L!-X2%eC2QV{s(m!VPdVG6cKI?G%Gp9b&@F z8BW+UOfHG>1+gM8$PHG49d26r)4_g4)_Q>HO^dk6mn*g!D#fQ^&>v~@Wm|9 zs&uoB_@(bG!LSIrT$JxT5ScWkEUj)ozGBiZ9?Ne@klU!Q(s0t1w1(|BH5ZV_%atb< z)BCm%_^9nZ!>M!77w`(SNZSs&F@@{1M=R{PogNu{u9Pc-&Pba{)I?XR8Fb&* zq>v7shq0~EF>jOlmZta~EA~BxO(9C6jeU{ON-VNxIWBNt;#$A=6nwzc)*b7_-LSUE z1WNOY(7hA21=u9X9z;X(Jz$vvMK(AFR`K&elGTM4_)KdoS`%`(K{kk(eJThfrf{S# z1_eL^p9Cxg{vG&e%XwhnDk6}UI8&jjUSOs7Bk|EF{n+IRIqWfjg3=uX){_hu76b5; z>PGy01ubw{LiZ}z{CF~=FAcs@^yfSJ6l2el{`5*#L4$BQgBYfk;1^4Tyn~YG;wpTJ zMd*v^gs+KEqCQQDF1!8-Ja73;_evRISBu`x_lBZ-mSxXndEDwvX*WN+PMX-z(-ikW zlFZ#KIrsOK-vI#RcjQrg#q|5jAAGBS|M>aS-OVq}&tMtv{{h-;+scaC;+X&XP~XOj zQM#y<8n;8tK08Iq>0BR6w}7(C@bxV*kpA-hef^yu;-%t--jRqdPD;h!E$8fZX#Jjy zCDM(n{+!C+somF=Lp@yf9xxv*x3@Y+T(iXUhehNt0J+E@Z&-kgil>j7UWn7iG(wUU}XwZeWRma<( zD|ZmlokmH% ze+qtEQlSW>)`Ti46dZF{PWfWtjISk>Zh5~#VQnXz+9JQL!MU|nTMaVzXKDR@?%J8?H2mi(&V+E+#3s ziV$!6to_l?CC+}lkDYh-nDcuF?x{TN#>yjb0RKVUF1;Td8Tjb+ojKzh>35omDeZy( 
z(z7vEkslv}_;{sXv4}s=fkU7*DxP4%b!VRAd{m_Eq}4`#xqtxy{J-yDz;8*Q?{^Im zJ@>jaKcEP%2FYqZMi7J&h?0J96^;Kv41RPp`=e>~o8`7M@PkpxgK@=H<3iG$GH0b$ z=iIIFkiO;5ZT`QQQ35;+&*g-{G4LdCL{tZHYNaZ=YjNP21Y7YSML>D^e*nN~LX z$N!#Fxn&dPGj8o4$B_|o2KGDkjf&tfiVaN%v?nS^WO#oW^Z%(}@ke2|@p()69GOu? za^hi^?UySLhd+3KfF4Q4`shl~?}Y>Yxc|?3e0=qHel||x6)7lVMS8teqxd>@n+rsE za!AotIKXP;z1SUIj@>&<(l zZRYn+u}zMP>{k$12-xh8yb2y&mi*O~3nZI)Xnq#PzvQ zIm61RNGy)0+)&#&cwcS=pl6;+L7;rv<|Bm6l`4~45DsJEBVlro>CG7gG zNuh@?&blrmqJ>mRSqEePu)#i?oj)6R-24K$eOn!Y@kigDoE#)V@R0JlVk-lgKt!ex z%_&-vq^)uLHvi(!zX#oAu1zk!J(RYP3|*Wy&V|gULRLtm0(Z-A-vW)}FEz}!tBn2% z?0nftKY!yjQ2#E*zQ~s5BvoQwbSj&|V~e2Yi*dVBQ^Pe(hmdLZzTXbbeZHKS|SYV@OO-vjpe`_w>??V)s`mjcrl!CE}%V3 zi4dU0yq&b|ucMnk8_V1LoHxD$0Vz9_DI*DN#TzvwtF3tl81-Fpo%rZNafC?Q ztem5ri2lwhPJWVZH95wN)3#qaX8(M0aI;mvy;2*e2$Ax%eQTV8_n*5-eSTvo2+i^lCq4H2&Y()yN4k@1y`<{$#PE38rb-uH9Sym`@n?38HlftN>Ze{Pf zo}Jv#xA6jrR~fCYz^*)})6lK47HQm$D6yClA<_2~JrFQ{zLKvVRuPltZA%KZoma}) zXxIG%5Qv{_L42HtEIUEd&coMtL=vXRia$sjPtwC7zhKM{J1dtOd0aoP$dZrF!-Xk6 zem}|Jb_0>qRQOwYUNZV*_AyS|A^yO?1ct%*6HlC)0xTrF>+H_XxVuw!SDo4Y z^2aN=$t~x;=iKu?&v~B%(SJ^ZEC%uAxIzLXlC5@-ByMx7hd>D-H`_9^>ozn0Q84Wv z=NZk>3YVgQkpsk$bx6QvqoK?;9vzuA5F;agW)PxJC-F!?5{RkAv%gh|_x1J@$b#RX zsWfiJgn&$yLt8YvLqPY=p7z>QWWpHW zF2GlVqz&;j2|CbKAfvgG1qlXciu8zYYT-plo6emCDLi9F$UGsPwJR=5I!K)P`NDqm z{1v+;6}>3Go}e_ym-4S(-uf zbgLA9E{a)59|AJFQ#?-pf+c@u)qQ`jTl)7vsUgGZkj#9b7kwkRt^pRb zCi~AC;{NVX^ouw6*%&|Eq!`lfhexVZtT~f((2l0ky7wB9J96Q34|kg4f1SDhF%Er= zEkRTC7gN&T;X>c~w|}sU{wwbM8{hpyU)W#6QNHCd2u}E|9qB(gKEVlp83X!-4Eb9+ z%eOi) zB?t&XKs=t6Z0wGtvQ?{+>)wF!#rys^HKh#$hZJA0Qs4?gZc3z?36MO&`BWk{#MWd= ziWHt6pwSYK1)IMvD<7(C3rib%VM%l@ZBTjM+UR{;$VVK-1Qk|Qm_Q=G z?78~8-b|pLl(I=|_E@H4s=+n2&D;#4RURY3_#ErO>YBt%QtyRl#z8Hs&Y zvcKtX&)@+%|JVntiUJ;49d6BWmNfb}sL(7?12kikmpMVok2~a}X=k%qe^W_=-a#PP zg8Z(dqrC671|pU<9=YgRtfLSp+J#lbowv;KO`P8YS!?*@?re+)&-o`LMCWVu#XsX2 zF#>=AMbj@}NN2V~m=@eAdi^xW2bB2^(aY(sRq@FJ{7lQp=A4o5zWzEnT|_m!=eA{a zWz<`^|KZ0nNcSv8E}|%`wjI4ukIyC3jRoH61-hX&hxczcrf8o$-dqve{E>u4Kz8P9>m>i$a3gSUe~S@6#)_Y| 
zx{>;@H%%k3fn$LFlTNjJRY(v1s^A>HEOGZ}vBHFL#0r@Y)yJ*W@&TM}2G-?zPjTh@ zyg1txVuz#Ia&Hf8J-DI3^{!u_s~Jo-WMk3QQ4vjKXZtj{{Su~AaT7EkjB!Rv0$O{X zH#=`{X+|8?w6#tMV^{rNQn2SO8`fqReg z#bM>ruQ!Zw3pR51j21nM4^%mDGuiJBWx`bXQlR6>pe8CBy`iv}?yZ-F;dl7Uz>h@J zbGhiJ1o5yA6oWACU||+roM6H-0=roQw$7KMYr*1uo_F(=+YM# zdQ1Dt;@CML!JPi<`^#0Cm5clQBpScpE04?Hvbd(iRij^85*t;>GgGzB@9EK`4+*i;_vM{Y`Z%Vh6iTc-BC2f8~y#i>2 zx3942lP%@L{zF#W<#&&c^dRxVi!lMPI*uw`I3qV9kqD!A> zobFK5Ud-z*HY4i!WPE9P8n28BK@`De#xMIpqR4!}BG8VoVUKO!>Pguc$>10*%JcgI z<$bsEY|-K&oXhzEoaLqL##hm#WQ>E2M-7%i_;Y!sRrwR~Rj)lAuEuLUMOPeeU5Gx1 z*A``qqor799D@bY(L>GkQ4e%fcILXLgFi-=CfUo{JGU!vNxAe?yFc=zYS4*GFHN^h zZ?g5cJ(TK6rryJne`x@jkG036G?z!?y=>aLmF%C^&0MGh`xE=oX6vgcU3V1@FRYw3 z^9*W|fCMb)`0C8d)toq2%fGZnw6&I}^L7FfAh>w{tZzl1Qf-W78`0XQ1azq1Hx zgt0B=u4`Y)V&y?9hsMU-P~%T)T+0A z$0_PgvpDME&ymSc`AV`&K`4YcU+yQ}^t4p%~tl5R7tn1mz<7WCsT znb{JHvs-JvWvz{-r}W^KbiAt221Ub9lsvq$n~dZWInpE{T)|%Z=6+O5Ib)^|pgH$w zGZ2D;?H-b#dwtL7qVc?WqC@T6ydS%j{!)OOk=}Kql7L4;PKUdX3zE*zI}y7KhkRJh z$jO+KVJWRf3NRs?;QZH~_Dh%Dq-sm^Z(Y#BAKwGLf%MnRz25_!F$@aKKh=ZZ*X2S) z8_ixeSi3^z&bVu1nA^T{a{EbaePxlY9u_|C3%=%kVpG>ye#qQbuYs4k*E}+@LDW=+ z3k&ttR4FJC7Z=D{SV;?`w>jG6xOGF87z))fsjEKGhN>%^RXW7{HXGdjB(&gA)5Yn6 z2}qa=La^ud+8!t@0qm`)!Qs68mN%*P^c3|A=%WJ~I!|tHKOULj5ezB@a=XC)qDs!6 z=jf|lp*^E@V?O#tuQUu(45RT=R3z_92P10P7?zDw$5oV)3ST42tYQ~b&jz_gfrZ$2 zN16loOL}=kj%MqI4IB?jru`_fS#Q0Toz{nA&g!yuIU*u|GB;~hoDW^X6tYpG*sY4S!P6S06y>Bn;1 ztM@KMF1dGgtQ?DK(bSS*!M05_A=t`yocH-!_>>#T6W_TP*kV@Kx{>EGW`gsaOI*i* zS8Q@5i+7Is9y8hlwa=zvpN+5j<`@`?Hsx=ap0Ku%8*CICKE?1@X(8m^Wo-U;XD2**598g$TgVmd zrn*JXshd@u~tA1;EgfSlC| z^28b@r~(|;PZps(>{&}+tu_*yZ%m%y!0ZR)G($C{r!vRmk5BU0I`LQwdBgC~-m{?Mzcr$y6PC@{g5nK!v9jsa$A5N=o zyuAi{q&EI|RUyeUwprqIz;T5>LFG5>?T+%5X|SYu>COYUtiUu<=g}0h{2aE=RkA5L z!WV@qyOP)=6SHZjI&wwR@6{YjH))jk0ZLl?l@_i6n$%Y}VjWf~(!~H(x5ye> zOlEy7{YAzDyRV}n#Pkk+4(q#oMZH{sa}98pp|-~jCv#p)u;lE|BxXvTpkP+(lRBAY zFj7Y!%UW4=Tg&fLv=TAYjJC;#zcOKB+&!Bn?Frl($-kk=aNExu?{=*Jm{Fut#S;^E zbDR)HCvx0`$ar`*_JBA_D4dfy;UcxL|E?PCh^H;uB&bRq%AaeVF!~8GsjEnXkS68y 
zW9)E-5EbW$_ncl=c)+$`FZINo@l4*ktJ1;=es`0w<<<@vo;eK_xuLeSxYv)T&fNXT zC>FK!hPDWc^tIb!%nv9*PQUI}u;D%Q%M-0HR~miqMHrtW5??#w9IW-ZCD3Q$EufVq z4Yl)%1r}r+yS)dR+k&l;WGA3JrVPb_=f&ZnyeuBDh?Bn2Goy25Ww5yuEyVCRDJrtl zwzJA8MQORRvBIx%eCw79<8V%Gry|2-GS|z<9;r}Yz7MY8Eqd3k?KMDf`U34#B1^v6 z;|3+9ndIVRnK0=BM#z*r+TdT})))AE1k|=r>E|izUY3c{;K@ghlL)R4ltSq!3-~cCuK7J z$#rHT^a7lm?WW_o&e8kOXs%ECCc&#@hgk|gy>~dkA`;<{1;V>I_xm8y6ckO~ zEiCysjqh~r-vc>}S`W}m7{1<;AJyY<=M+1yG2|3Er(NaY?U03ItEeu1kW!j3N_Hab z36HkmQwA1FX6OH9WSpO?NOH38xJN=yg;Is>%8gy$SM(Byw-E%GF*|#YJ$P=b%8wu zgwpRfX)KyGN08?QB`Zy+NUwHV@%+#;4=p9rWb<{Jkr+%A1E`8qR z4e{-lItLECzDlA&GH2}I?zAJXZ;AJUEgZ~yI6;Dos;t>wpvT|mJGI-m2g;qdc7TqW zBwIGh$n+GWNc86p@^!OiG8(`HbfLn>+2I{6W2#aGmw1c+HM3N*7kZYM|Tps{S)H{n>@Mt(otG{w_S}S40h$m1r|~mEgdF7lZmE=w}P6%hfuS8mWah+NxSRQ z!K7~%T5l@!`Wr?xdgVLh`W#EQ5#W@c60uD;y~`;{^O(vh1~)fA`L}l_%yP@mB(lJ+ z1-rA}A(rbgix@XC_lm7Ec7xEs2J~vyc=!f#S`Z{Da$ab~iSw5|wXZ_&nCF{jY#2XW zA;GBh)?#XK!4*rpo}c4U+!UT1P*@HChORU0WL`uIfA*NFVUR%*!hhM9H~JLWM#nk~#eoGi(#FlPWV3XE=yE8za$;sS!Yec23^4jzBpZ687Xppuz|tc zL7pTzR~fSuU?s8#3TER*E|BdUzCAAQeDKYo{25hm3Z1Z16%Q?XW`KnGH~>I?b2fxf z81t)qdH(s*WkeCU`C-Hn(F75A=*zCDOs+QaO_$27wS4)MfYdT+?TGUjRKR+?Wix`M zS7Zkw(cgJl_{~(x{Qe7S#CJVeb22c9=N>s&Vap9fI&S z&$r!(?T|-3Bcj~L=`_pNo%PawWeQ(HTRo~-pH$ep-ffR{3}8 zKvN6$L}V#|tE0KXPX?(C+$=<{wN6rTTo?;#@i}-?E$JL>K9Ks7&xx7PmBiblyDZ{e zn1eG`uPt(B$Xs)lU`wfH%HSIg+o4n&jjnvLSU9!|(O!039%~oFG-ea|M=_rsae%=L zQ-+7x|yT(57s#`6`w+Y!oG?q_f@+q!w;DMB! 
zd#rv(D?90yp8s4I%S8A=(92CceZK|v?XCP4%XmTVyH$7_M|Yu}@{!)~zL{S2KAF1i zym4a%mvJ>ltYwRSAbk9}fy#`-hd0;LuF;$szO=v1Q(DM&fIBs}Aw6URVvX;D@7SYC zyi}%df2skCNf*Th=^<~gkMK^Y)R!06hmV`7agZF@J%xB| zS8ZJUK(HxBAy6u4M~6(deu zvF$hyqZBsIm*Xsk%c2ZkGa49#ystUNwco7@X!$>u0=chxo;K}-%^u4Hp@8uz`07a79(-(F2m^VWxno|Dl$ zJ<_?D$;y*>tcow!Gf3xMCh`@cn>FYBv#MUza&wL)>V%+ls47iN%nD2O94ar6KD2lz zSGR6EJ>ZG4UHkHrF!2M!&XSK0m}A8o#OY9!$1e0QglZ^8DTtW%1t0DhQDUXEEy_@2 zOmVvu$a_I1FzUlr1UzyQDjtM5fo1b5-aX39s-haB+PYeH=>BHits73E{*ijy8Hrhg zIHTq1jojH$8<*meC>M!ndnsF|lV*#Dpd>PT0c+;eQzy%4U(3gI1djD(R`?`HC ze#i1w7lcp==45B0{U}3a;OGhuyzEJ+;&a8JR*pQ0i8;etjfwPb%71ipS z$G7*ihcT?GZ-OZOKAoF0ejum4?ocjZWunSv{WD-klblrp1ugQK5HF z4b8t!lL&hzKz{zX9?LByRtcg+o0gtrY$EKoNVV1n>kWT*)vSX$@llA8SMq4dN7B#% z1N0n>S~a5{&sY}7_dbL2?U^m}D@Sx6DQHN`lr#Z7jq2YI2f9n8eS-3WF z_FSt6hfIS=Wg1;c2#m;#OguQ$OK9nVdW4TxVh=5R>~n~dFlmf9GCNS}rycg;N!{tV zu$r?6S@tVV^yvsaFlIdvnTtQ`O))?B*@N0FO2;5BEypF2>2@eC{K*)(9;b0 z3k@}#mL*UMT0Yf->xU+)EXBi5RKVG2O7B`3pEz}vp-#Sw*q#I4U&+KH>NGs&2lk{} zY8cRs>nw4MWvZU?pgm7Y_GY@Ai*cbzyoHUprcd;_12xutLBQ?djeV4koGq6UyjmDI zh~4rpq&)8*y=dgq#f>P&rlY;Nkf#?!M$RJDOKtBzx;0R?WfUP(t815ry$PoB6jMFg za26iPF6empwa|I49?)SQ$x|IV5$m zz+!v!N*6UR2n0sJqHVBd<4x^M(>P8Up5BA$FYlGpn71&_J70Li8VCYMLr>D`_U|f{ z#>_a@&`L?a^o(lI>5o&*ALLR=kBn;9st+)IQAkvlOQA8;xPhQWMm3=KWevW%Xk}?y zDn}M*xHcC)k*PnMl~%0k!Q7WQbm@&~JUN;w;yApgq>@5d`E`gopJF)cgE|(_!uy1C z+XJTPF4EGHu}ewUmFW~U&JZbko;(ve(R*liCOIc(|LAlgc0a0l+vCGpu5)ug1uybw zWLsV?N8&h`j}6py1FB_0C3X6hl^+*|bt>I*(qL4jf@D=49ySl<;dE5cU(@G78JilY zu)rFtkEvrw?oAdT7x3NA7Gd=c+GA{9HxFp{RY|`z#IZ+Q3>c=V8#+rj zOPlO|NQtqI7+4!ie3b8WBZ3Wk514sT)@;0@d-Rq(``*=JH#RfN10&CZ$1FkO(Dhkf zyYvtQT*yZ}UCnmvAqDTYFwI*F_Gd&y#S3+lEPmS4qcqE?2pnUUO2!9$ykn=g=`{8Es<+>(qvni7Q+`d_MsB%tC>PnWUH?Vqq**yG&5<}d^n}(#w`C+*xXN5 zVrE_7qt+`hpwxFDIv@`vXmsIh@q9exU*o224|lr#9WQu&KG`guzO27F)9OZudqnFg zE$4BSJ&@}QH&&Qf=S&&%A(C-p}kVoo>Nor-`82#91nnX>z|(?}8Bs-d?rK{Q$o zmRsIUcsO1rq8Qr_59eTB1kVY_H|{El=E^DSL~u(Rb#az1$A(prWf_6Vo*SvCmf6)0 zj^E+?^dfqa@+@-#I#=KGbmJ&5hBc#TLZLK}{2~({DW$w@CnXu^Sm=w#P9f_|Jh}Wk 
zME!Wn1RuTS&V00;@YNv2;vJO+Ns)W)muvYV&?jCR4&l_00`!W={^sr8Q>_j!UUo2T zP?X={gpLhlsw?rCw0jJ-PROu%p2TsOItk3bmC5w1%XH%mAUbzV%{VP;X#o(B))sfx z0N(D8>i=G(=o|b^pzHsGF#KC#`aj(NZKTfsfYQ(^nQy_8*JgK?9t#@TY88Yak50t3I=)F6WVW_G=K#|L!sV zAycJ5i>qUfep(UBot@kz?eOk!yn*0NsG0H=KLR76z=OMiYBr;E5N_jC)vc=I=y;hG#p4B)!<163 zWP=U8Liye)!g6o0tIyaP}J%y1Skj+u7`}XViIxNvnHF%Knckgzpow*kx2$mnc|E3 zfnGxDJBeEfK>rTXrBI*|+<*qq>mvvURMgz}XojuMWy4Snc+tf*^&d__wVnXSZASo2 zHQSfJyb1*z2SOM)nvn_~560wsq(o#XY(0g&TCbVz^_#|9-iOsoPC&NSdux^e43z21 ztMjg=@x2IXZ`ifW1#T;K6S6ABfu}|>0!>Bo0I~>1h3)>k>$4r1Nu8y}TY%@J9k$lN zKo!HY->?z*TEwc*9_acUtSf7`3)cl$>}PifbUOL@;FFv9pJ#GfzXPf)|?Sx}TU5N<7TwxirX4{j~E zE~e%DJ8miqI+D3MqbjzFFC-s$dMQqW+*-2pZ>5BMO1>zfo#J&q{FVV&{c1+c2SI;1 z8l9Gee>dOyZ~L8(HgLOvfDqBwksbOy$1!eN=D-8Jk2jX~Kqa;35xd43uLgzhw7cBlDbk>i)Ox4%Zt6ZZY@*ox4(;rk_vzbBz2 zeCH2;9wbd#>T*`*-*fguJ8!ZdG6mTK+3$gNbgJB|AC4*DTJ}H-s6CJkl4*EDf;;8z z-M`Kfh<}_u*c|(DMiKSi3NA;Z75d{;gRuR&O5^|E?Mdk1@rRQ;zeWBK_J3ajjzG5k zoZKS(|F@;$|JQ9qgnr4LQhfHi&JH(D`-4}(dmuP$H^*4=3pCc6!rq&eMqTztJa|7_ zv}odJ)H1$=?33p8-A@pzIi-0KgOm_n`R|$P5zh5>`c3^e*pWa;5QvTM-aF>k2!TI( zAAykg35+HH^8{NE2nhlqK_Fqi7y|@C;tvPF_Yp{lKh-lJ5E5UtZRr1ZF#mty{t1Kx zfsptUtho6zkuC&6;(r}rC*0q^MzRnHiT?(WoN$fbj}Rdc5(GkmKuG*-3=jwj0tina zVtyI}1VZ8qF#5aJvJi-XKc(#ngv6f;3@4Bi-*I{KFAzZlLV`d@0KGE)7l?$fbq@qW zfTJov=B&t! 
zbm?niOlU~B-FfRX*r%B4192}3>8I7^q8I_NBvU)QfLtJ!_5S_Z!4Ucs^I$H|4*f&$1m5okxUwT*0BzN|j z--qZy;?_OCR3F#{;OjlQMOce1Ip(_Gc{v0}A0~X51|bi;HT@{lk+Ya(I`F*Q*_JSk zIb{opX8LP>=fiKLU{USPpth5RP{4Zx{t1B)a0l-gmGuZ`IUR&lIf2P8?25P((dxI3{a-1{(5l zDe<21cCE!o^>*O!UVE{{Vfo2k^=oXHS^bTn_hpF2Wp9Th%t~zOSF0u^fa|q`rVxH% z?-!eUvy+_yw^w(U<@4rOlVN6u8x!$^SguZ?I`>_d4kliKrzDYt7H%Bocw33qt;Y`U zeHY!xqr!b#52IoegGf4jS@xbWcWmp=*_RS(VfGxX)s>|A*zaN|tQbFX)p^m)aXq1` z1FM_BR<%?+X{n$_n*CAkxPvM*KD;i)={qkcB-o6{{_v#;ClyIWcPvC>`UPt(L??>B zR!iSHoOd@AB^Zsu)oDM(J;J&V*m~jVi}x$NzPcYFt2)L94MhhNqFJTX0X;FsAMJKK zR`x{l@OLjpfa8MAh~r$BVA(nlAfu@`fktQ7fyIjqkU}&jzs_IlnlJI7_lPv)uUF z1^Qx_-t@`4aeTdzsLuRx(}Dc)^UN8Sbjmg6VzCJpGw&Pk??)V(G%{p8{O%lMKDKZs zwGpuu$7b;S@K-Uz&7Rac-#dvge)YpcI~BC;xJQMZKbuj>`0m=OQ=8kA$uKtX8p&82 zmHTIS6Dblg*5<<46zqJJAhhZaB&0ju zRIBH&UB*lm!M-hXTxh4EwSC!DP3SdAp-cm7=oXC*IjDPNS#wDqz!e3QPiMRHY;)0# zi@wi?T8$rOMLpsd4p~2F)F9`w$xdQKupdqu=VMtgkYtq`S1L^xvSf zuiTS%?TTM7e8b;@X<&AqMb35}{XlFeCe?Nw{IHDrC@sE1m^Ek;u&S}Ek=D(xOFG-T z>VC3cO{NI;&YFu4=GHi{>hUvR)<*iN&`l{vZj8`}cGWbXdee z$Guak<}#|J5>dUg5zYt6Jt*M>b37@mK1ywKTkpG&r^f+lN_1x-IYjDEpBnmSh{YA@ z6s*Wc>~JOu{jjtr>T}0Jv1;P0El%D?qCbQnEt|ernEXsDCVrSjTH$G1+4cgQ=iSxI zU3-zuUL3aT6x~FtzKeK>gCp!Eoh~y z0k~6NZGWt+K^JCC0g z>y=3m{jY5i2@W)ahI?bl52=+Oe>zj6PD_}&C_t^OeX9uw($nvWmU-cX{F6RgKeW7z zv3rLjudiqQkqzeh)MhiPMx=wQy$2mBtZztFb2WZ`HD+*$QQuQD_8-}_ej~lUQ{-qWRYfv}GUIWw`cu(Kq(pd*#$l?+ zO334i@6~g=?<4Bml0HzK?N9P@$l6;?sv4@^d%Nm}v`^w>e%Vq+Ps+#5s;QLGWJ=<0 z%=N2mHFxiUH&-qSimuwVd^oJl*w~+(ivC#cz3;IRY#RO1g_S^@zPVlQz34NoueDO( z8v=bsBS8?|XxOk9ql$)E((L!`wQLlQdmijW6hqvuS0@^z`8Bd^#!$6asXwPCz-)#l zye6u*BG4LJMu|c1Q2)!o22D*b)n?Ucq_?N&HIjNmeR>z1q%0kwV)lDDD1{Zwa0fDR zor5GYO=DPLxa!!mafA=TQKBiIJ=I|^sP5p*?|#o3vK{?-!?NPq<3g&wR__fhb;N~W z&?vSGJrxH5t{E4@DFKJ%;l>N;zSAo`6ia%flQXrK&6=7(( zok)n}_Q$;$L2sqA*Mt}V9>W1ks&U_m9jkJQW07$Fn33c)a(}byL5u7eknw;FKn%uo z#(EHKU_sc%&#pDGiS0~yOC!iYLKFaVEi4dp860OJ3Lh~W$Ttdso%o_k(lkO5QEJ)A zx-sh5$+-iS+@Jq!F;9S4pVHG+!^)#s|yzg#AU)$?C+tcDM88P 
z$CLa|`jk*KV-1i!o9s)YEML#p7A+`7#UQX;&6L2E`0^VwqWPNArrXu(gY~xA>4?Yb zQEF{k9L{^AqQ#kGfr)JGXI4aWDhMHEJVCv}<|gGf65gC8L61)z3q2wfyd#5e`^Jr! zY4gLgr+aF!nlmcM&y4Ku;!lLWnM?8>XLk60A&aO$nDM7(5?^hV-`Fy59k7ZdK0-IkQ^t>L=6?s*Um@ZizatJ|s#p+4a^z*$w=f_feW-soAfv*GOuY{J7Gi48F(#R`HLMM_|ym_w{OVv%b52Wyx=!MT+>)O5f9xcUQa_T*QN~-Q_lh` zgGVc>Q$>?M0@xD6&$}D<-C{|7Bl*0Fd~oRNK<(g(@dSl@`~f`1&U5*er+QMkc?FB# zER)1b!d8zBG_k;Z1l#U>Zkx!ijV3UbkAF)Nl+i7L+c3G5YrU_^+ry#j>ULM(l&N?_ zOP{+3;Z;^%dJ}Z_Ru0j1@j$lLz7tWKGInGHUp8I}VR9~@rg$q6gg+;HKQCv%&j{5o@yEC*)mZ}^aP70-6zGZ(X4RT9=X=h7pY}qrr2o1 zrSY3;7Z=Y01k<`meRevVvhFvIGiTHtPrh@2{SJ`p(CvI_L`r0ILRVzGj&I&B=k~(d zjy{9;XaQ03uQ3CGF|7#WtiKuxeDlVal2_HIoM}>m*@ab;9NMmwo%5>M9H}917-++q z>bd%gQ`eynJx&xQolYE3jjJgJh`*hvvZ;@=Kh5Cn1 z&)4Mn{)|dWyYGTA&_)w&pVxxwf9+i0EUQ_nNl+&k&f|PmwiD#tmTQ*>zVbg9D#E{% zFmmx8luCB+-Hvwn*eiS6MvlDy`o&nTYpMILKi)8e`%YpLBs!w_q3!jw(zXL=1L#NC z37O(6+eyBaLK(i#_)<<~(S?6@pyP#7^uIZ;*|9uUHMl%pBN3L>5hoTG#3UA5wNgoi zIr&OzSwH_~((Qz$0~QJvRZf{}`d2x1KY^0R58CPu-n-7AAmK-E7QgU(b^G;|JPZt) zZ;h-G&JZSQZt5Y@+`-F?>7l~r?}*i{CttY%;SjWWMFs7&FTFYp_S7}cRkLS#h^ zzyCq%J?w<=sXwAB83-eb27%nzA)rvE|Jje){a|n-(a!dc4j~mWmot}M8Ck3!cBNUk z6X5avR8Nx(jCGOY{#UZ9ue4Op_Bgar2PLic@o5g~Y|OP+n`3P5if3zGfS-Evnk2@L zCom&+QLN&f?Gh!h3z?r_N?!YOpSNH5Z@mj_l<;{2lRXm(2EAhKh|cLyfj;YBe573*h|SzI1J4Obrzi}!kMMfDo*Q7#CbJR8Erm*})>PJLvXBd}5(VdA^`u}##J&vKnDzl8kM6LfRh%uz$Qu6mN3o``|xdiXTvmauVwGEu*?33 zJMZ4!U5%LexLUij5TB*wsqldplHlroXUK>xsZ0EqKPNu&a*44-#m0)qO_9}9$=>y0 zqMWQvJk(2Xo?Y} zpY_B^IS*tiLO=8<7-0& zTRy(O=BHofoTp#->c=w^zp*&cZqUb|hIPk$AaRU|?i90J-p4MfCIVFPrT z-I%pq9nrb?<=W5W^C-p+tqolkP%%h2VmEgly*b^~X_b*+5yhwf%&y|cnPTgH-eQL-g<|uzs8p6d z*8JLi>mI2ew`H01M@etKNOK#EIJTSChPH#6bQ~I`n4Xe8WsmIuf3Uf zm#prUF3YDBaaiy|H_mq8LKZ)mZGFc7dV3@I776E_dsl2TDBw8i&=iyJwUyK2d@ht6 zuz7@*C8mc{)YDXuL%;6JU)mj_dB~Qbx?s9oC_(&uVuMv?;o8g!Y2&PADee=0rXfF5r%lyjN>kAbk zaBcE_c_d}OPIuAs7z;AE%c7$5%Ba;!-O*v#C+8ylayx@t&hdv*2fymyIqi9q&j5&? 
zjb~@f#b|caeYIvbC4g0V_H*r}Y;pjHW3u943l)wH^pMkJ^4-M8+1@%&##_0KR ze9X|D<>838I)<25I-^8?P7aMiKMvl@E&Tvo4WDlxiFhn!mFPMfW0o{kG`k6;7rV{GC0XU#GR@(T1bl)vL!$GXP94%U%CuzpxMvHFz>cPVqSk^KSm;p;9Z#PrhgRFsMNijN-iZ zx~SV+=H0rW*V_9PqaD^&_ufvTbG4E|lZwqwgV)is^(F2-$9D8)9BdLxV zu7oY{pPqRj_C5!TRBXF7-{`_$HImFTPd6leI3+zBl~c7^D{(~vO9Y$5mcaDG zC^@woljqw%P7{QZ_CD-=A(wguQv)4SYYtFbbcU!JB;JvpVou4QGa>`;J`I2li$j3j z`@^;CiI{c!XXS-c%SIIa+!RXce7c!VQLTwtMPaI#Rg?x}VP;+lHM3KfY;*tV<3z55 z0(;gFKL}yxF9uNwR^QsA^w8=qvSK98=t48q7K5jiEsdRGR5~*dGmEduS8pozAyM2We&s1NgBf7-&VmfpA)>_s}M~VD#p?W+c45bFu zgt1ix^i9yQTt2}T5+k~_2DbQaNb=MyJZCJ;JzURG>k~yP6#WVd@%VnwV00kA4&#-a zYNh8A8=-lyicA{eF6@aES^0d`U$RWL<+`?=hwIhL`1gS;f)CNH=niTQ2t%E@69n!Tc13ROxew-V4jq?!Tx(#aqClc27S8L#g(7t z?#`fcl8z4MJoCruwvj;3uv^1*`gDMa%I!dTt`%yQ2rjPCyYZbu=tvz08(svx+vqpG zq{4NIgs+-${nT4*BePCMFPuRH$qgsBK4(TUkuMN3CqnqdLjpq9talI|Jl}C;FsDuipKXB8$1yW(qB*7-7l6>@`G(-)ny?(yC1warocIo_({dAPzx&3rz zKMK$H8gM;^P2*0};CqeL8wsnz0c>6`uXCaAn!npmd_tOFcU_oIHh-gU zGV*ZWZ(2F?9P#ar`vk#oBiN<7-eXZK>5j`75`xgB!o;2<$;~1bNG@-WdIjAYd-?9$ znZWu)4;>5o{E?EZHUx5t>k0?ySc#>cQTUxpH7=%v2Wo;b z-9mYN>D$ZEpVn;|_if%N=j({^tLV=g7VlKgNyc|4fPT+|1mapMhirEY=(^Ml!69l4;&eEy1g2hLeebmJ_v}vnodWn(MwT zi%Z!)c8Eb=2(Z|mHj{pXI1Y8ap~|3 z%|uQcW>_VZR{rp3YFFY)FcYX+E}FhTS3cq8j%b1GWU#o+r?hxNG>*;5;wqx_Swmpt zrOZ>^)h5Dy7W+z<%K7`tZe0z(ItKF^G6~>(rZZckb$c^6C*NZRJX_Cp_(GFy<}({H z6SMpC{jl`Svm`%`DS<7RyhWem-Zv8e5HNzZe_VPS^x-(xlbUj-(P#M#t1E=gG|I#0 z%Xo|i$*-`Hj7&%gLzz(8Q_aq1{h-d^o;%qs)@GeEwFfhTT(pj-Ufqg_1Rp6at3>fF ze0W`*$v;+>hl{|S$Ucccy~befO;fUeOPyApcqXycr=@lBi+;|}U!8_od3&?$U*_2% zB#`}6RT-_DZR{{pdw$GIj~-OD@1`=PpVH{PhsvWvx9Ch-+t9C^G7EhDB3Chg@pUem)EBf3N?hS zp>;J&FLKXhQUa`pf?wS=Z|et8P{iWAVBNtw_T%FEh)q6c-G+0A$lynNZvt+1$>_+< znh|?{qus2c(MDX&h_kQJ&a;#T#Lo2h5T%2Laf|ag2mG>1JtL(y9ROBH$lfT(HL9!C zk`wt=Cz6Q-frkI42>@!N+KP1m^1*(zPMN9u83^8Ib&BQGi2&z4DEE3w4hp6IEw4K> z%1u<|5z_dQ-tavgYW^d@riURA14At+u#l}{V%b@#T9tC_*MY2b8KF_Kd_;TlSc!|17x6c z^-gsJ)Hz5yNYZ5+fNxlx0wGT)4Cy$ zK8vO{P^@@KGs*o_RqctP(+6=Qr2^?eW7tjK9=pEigM&F~F|qNNrt2d?4_!#L$R_DS 
z2bbzHHKLA@wvJQ!uh0iHx5-(YG{m@)JP<_(u3b`Y-!CYBL@o^={j~$$P%+$W%c?`3 z`HRH$Rnw6Q2UAN`!twR?_f_U4uFeB^EZ z?d9Q%k-TxCG7Uf5`U`A_3QSU;vsQ*+gE&T8V^MdBI>aQ8b)}8nsSqx2@AZfzj_Wyd zNtk+il=~5JCk|&<#qXN$sXD{Wt4OplAYtCc5A+E7^O<^p-C8_JhwX)!Xzb}cXc8Iw zVz)FD>5lFju){rGayTb`Sb3d35Uj3idthSsJ4voJSnI7+E}_GonS(F243Qw!^tm_z z{NW_I^rPCC?3u1x_@r>5el5!(j&3@PmioOk_q`{dJ`?pV!#9H<3G&63s1k$uqE89B z`C~-QBOn0xN`x)gSFE2AHh8_<+muG=m%*66XoCGALsM&Z~xXkC|n{gXc} zUki;DU&_@cXA}RTd*Wzz6+qa+_s07fOG~R%pEC4LrbQc4QlglQ5;^!%Ohjw%Q@ldbRypP(`&)Tw8_@*Hu-8}T|m{YG>3$2Td(ZIHCn*2%gZX2*P9bE#v z==x{ZrBZr^o%YCw&8WmDov^*6L8t8U@utP+OV6ZjGIq>6^V%~1_79+D?w0NAz10tq zFwK4wr9*C7j^1~T10Gij$rCl8Rc)Va zG1uz7`DC@p_|Bhbb7L*M?@_grvo5a2uro=zPNIWU7Ck_AyF)7dP|WUSR+ljA(o3IA zbW{24^iw+b@6hv_UJCuwy1++PbFmPe?lhFW#}rf6e)eIsfwMlp;Kl4bqIt}j?10)Z z#&E#A9yQQ~2kQL;3z2AYcW%8|67O8W@i3{^h%f@+$4s>?va|0B#&Ifcy2fW(0{t2j zO|HtLpG_Sg{J`4RFg=#GlBbUW*sHtEI<&-2!yOm~sawLlZxt`Dq^mQnJb$IyW)PjM z$5VTsCp@a_MMHF;2?=`vY9hff2XihJk3vnR-TF~IE6q{Fvb#(5Gn~%UvQ)O|Yk_hl zZQ&sAS>;DQxziu6L$1jd`F($szZGhPqTlC3h>#t};#M640jkP*jLOmGTe9ya`Z+`8mSH*ASIZc(-8f4Ud_r{`X-(_sE4|j`ZS=ra_%}!sS0Pti( zbjC4Vq>uE8e1&(+Ih$+QE)luD25LoB%q(Uk_iTRB606g- za7#7*RYj?Tx>2KU`oiN0te98;XvoChnaLIqD+%b*&sGt@aB%Az9qkWxv;zA+Hu*IJ zF_mtad1K;_m;9X>aJNV7Ut?I3#TCmP%X&xD`&hhJYj$Fjh|87+_qt5!ILK)4|B_jM z8WuTUyl-`bB5K@bcg$r1PFjRd<1F?E5^BRT7KDS!+==mFhAP5x*Xfl>qHW+&@o+_$ zH#gD@8i67EHl`CeQOBvM&+gRteTO#f$B+-o<=qI!k_qSSs{u(bOpZ z`-FgBt3#Xo@gjf})wAnAU-3`LuQKvjYw!$D+o{yPyKli6wQwg zi7lUL`W94FIV?w{owiktoHe|5>)CEvPC0dkGg0@Jt0OxTIXs7p%v)l_s3DBJkSC15 z9`g;8BFi7kHbj<7V6`d0@03KB{&um$2M_yW`shpBem1-(aHzCpnws`&coN;Pbm9?V zbIbCVg=vq@(e$zXl~S^4q`ieGE6-I}bX;|8WVB3;QnBRd3)Yc?;|CmKfi@+ln+nk% zTSHlQ?nL?MiCp|=&(Zrdho($U(h(zh%GzrIaVHi*2-FDx#;C_2(g-S}G=`7m>?vJF z9TXSoQtS%Qbo&%^<@izokTGEYe_(@Flt6a{%RxA&in!_G<9_d7`o~G--$`=q8-8+& z{Y&;(oUnk~KtGWB#|%V?GIlx|OgVXDWrP1t!{5A_AM~%NU1jjx==KbTpjNCj$3{hy zN}Fzhfcn-Snb#K6bBbv!xYk;%bm`s$H}Ty}+SC0J4fs0?OQR~=k%e>KDNys7 z-y97i(zwj3SUgbzNfeTLPeOF>u2ye^2XNCw@Rbe{cP5hd%O+iFHxiU3J)Xdq%P<2c 
zKxU`@p3UJKex9DV@1;|KFkiZ9)w4>TP3>faO6%yf^-<`A@o zLdD|5nRAYj;jui25uBT5j^)LWJ)N)7IZv(^_RPqZn&@cYDv{c_Q!>uKL)y)25E3xX zuCLo)%#kon55aCe3MBz6ZI%G}=nMV2IuGDW16uz9`I~_VD_QIXyTE>@O$rwHYnX+SfsPHpzE`!fnt2~IlR1#zcpqN+F%H(om60&C(I?)DI zbYUa4p6=^Z2XtUhT<#+b9o@z9Ui;R^KeQ>1pgz`)perQA6;DEw?}Q+7(l3{t4}gly z;)mCtG~z1Z)oxph*ejrB%0DEIO`Wy|)(=_s!#uIHmwZQh;Ly_25f45)Ov}fkO*d1+ zqx=thc}Di~-V|9HCGf-Xr4M^pF$pHP!=F%B8T~y-Z2TkLtDD4u$&xj16%DxLWxAZf zqoo%C3Xf)b!jF1uM~7d&|7{eO;egSy8Jy6#>yBbO%0+!DV|`zKsE~Z>M(Ve2?=zIA z|IjAz8fTQCQDQhKu|AUR=-xBysX_;dyxzKJ?Sf5_!msY=ds@0d`Op7$6s0zu`DOYE zW@fe{CZE;(?3B^K;JWGYhb~E993U+Z>)Kn6z(S0T-E~zWwcz^3oct#mO z=2D!Lts;Z4UPdY@Yo@xIKL0mQGy^zSJYGiM+dm+*tB~){?1d-6MUPgq13It_G#_ab zYH!9|cFXMObGDaUbKV$(CMlOb$u(e7`+Mk;8Rgx_b`ZNNd>K}+AXDwY`6@wa*_S)3 zp3e7sOW-Lvkt^V&YoR&M%5{FLfVUY{F&(b5|J`J!6_rq!T=om62-m&KiPXDH;Uh@F zRDRg%;q;^OVJ-WI5mg`hlNes-KLwo?2wV7wSi#I~Q=F!~LM_d$4NnsV#{;2}>!SY9_xS@3XRaw=2Yh-x<|c7|#uw)U57x5tj-#QkSY1KyY^ z_F9d0>=28fmNn%+O`DQlF8G_p^FzGxn}HLo&J~O9t6J_6CM%X>t`%)5=faCdw0g<{ z|5$@*(PQXHXA>;r9TbutwXZYgb}@oW!$z_%DK8fAl}(jDo2=V0SMbi8AKKC7fVmdU zRPC%o)WrDeCEhDxloGzvD=F#0c5eWPw~=E;=48%fs&n1#WHe>Hrg zCnt;;jw3#=p*J(nQ5RoN8T38f9LYpRqx9+>U}VLlGX%6er5 zoMY5^U3w}i%SsL}lF4uVw_&N%&nV9?tIpGjBe9Y^t_Vg&v@Iq!S|TB&(F(IH?ChF*#j2vN{i^PO zBqS(36;A&<@w6g1opLE8(`CD98IG&8lSgNZgQ5)Jw{8P0d2rCtQ9{`L1Zm`YE21l4 z_b6KY9)3?~1el?_dikQHiM--0L~(LmLj^7-PMQgAJ+69MnPcy`l3xOi^mGr_M5>#;TG7*1hg**W1kzkvoa0{q_YP--c+*vID+-_sIq z%yjzHK*9IFRe@&| ziG*}j6Em^d?rA{eI>{aXx8>h#M#+ErWIyyKafLOoHZ6mYgswZ>e;6ogH~NvUZbTNJ z!j+ced8wuh&T+f9WWSwSY&10T_B@lfm$q1cAwtuf=RXH*LCHrC0c9h&K*gj_@}%mG z>^5LaDcf<+{yY!_L2v#JF})8li0+6E`#R$G>kvt9Az-6MvM?iuLH0j#2C#7jb7}|n zRY#9MST7<_n2dRK>tCgFHDvashuxa`!CrsPg81Uel8k5aI$2V zX&9=LuF1Q6_K%cAgCOG>|79PqzBHIH=%>vyUZ&6f+cui%Q+Iz-EU3Q^&7KoHvb3Dy z_m2t9!M9!hm=+fSu~;}r%kx$|1;7?yQnn`61t-J!n(I1hHn@Z4Bxli)pU(?md3UIu zYI5Axyqr65Ynn3o-0}XWkmIcNc#34VGS%P<(&7s!sLId^P+5@bX~zc`CX0-ZQShG) zGQBH9h4nzqDegR7^PhmK%5}Yk>tf;4QfqpkG$}+8O(w*S`PokY#aPY~EowUi6z;yL 
z-nocC0lQu7!GaKIneA;_$<6b}FaN=N@bqFo#7G3#AQE%mUcODrA+*v5xBmGC7&4NN zT}MA^>FsOC$1hfzPP7_b@79*oL4MNY#(*dd(@(RwWT z=6L=ah|)1WNhd>u7Gc|b!!5-8uG8z5PY_^f$K28ntl^@TxddxAH4zj2KrM(4I7$?% z-}a#y*q)s1+AcpG27dLk5MeGzZOt|~#mJ#2?wh9KS4plShFOC8xoMxerq)U^*V!+g zVs5Rei-uVZl>eUJ76KE9>( zB!cV~a|^0wN7*M$NU!o?CeeUUSu4*yd_DEuy_s0dQ?3$KwN{{ZWk=e{dswoWbf0LM z-`*Yy3!+7kg;Im{w&_x~nZ~9Cx;32?-iO|t1rPq_hU5Nz8vPh;TJZuR-Jr$Xd3QrN?o1(`5yLDbNFl8t2>vG65 z(jgRfg-3dek?FoL)c5Q?N0ux^t$cF~7zE;e_1DDz>Q4Eo7)E+yVL-3;jio&Us*e0QBt7My2UJxa_~4 z@XROXta(Gv|6AJpVXbSpe~Tl$dE-Bx{9CfTWC?jiJ5PgvfCmEI{!_NXBee7=?ou|s zSeAcI+kbjqDk{Y>f^Z9P`@fAM!g$N447L{kPr>m9jhlbXouDh37m{#ZzE$P2dK0)s zu)iJ@H?9Ymoz+#>y_skUN&7-}NZJ=_l7XR++{Zl3XrM~`eHU2FizC$S2Kz!?|yPADJXu|L|6<4yu~_HhuD{g=Ja11EO=5eWZpBakNk z-w45Mc6#w08E`Afz}ne)T+Y{cO!TA9>>_kF2GuW}DjNP*d@jBL5x?u=EGjuOM7=kd!5S+Wld0F|fC|r_Oau_juDtulJO9Xe{|@Qd>^hM-UDzklGVvO5{ha5y`Ye!?=Wg~H9!FPzD;-;Q z^~{b%y0;gti^a*#T(r(V&g}qiyrRc%Q+%f=)lWF-OHm3*(4+?rP7S*102beH_r>HZ z*9pRS^x4b6?Nhndw&=Qp8THj;Z#a(U`M{;yWWKwOA`eLE1hhp4I02ANkFyylS%%~x zpjRhJsSZf}BoFy#V@`UJ+v}LPILj!zmFm?3&*A&U#>lx2?#$}F9C3TfXB(PA_I9r+ zqOu(V1VnHW_5sDMd>`Ye1aR4ViR0FY`v#)ag%O*rUcga&NYO^45CTyzK+Y7wWQPFV`C%BF`CaGkT|;? 
z*(OYv!)m#_7^%BRX*`ZiyG60xo2AH>}*{gB}5Pjmh2sU2Z%x3MU%SL*}t*}c3w zDhb=_Y@1C+<*3zHP`F5EbaWu{i@<^>C*eRhZ$+)4k|o0{!mymU8j!MeO$i|#ac)V% znSZ|IolC9;Zpi#0`jagBru^KipR4CgUB0BTC7+bgnFzkbRg&vTK62?Wc-Wmy{OvhG zd$~(Bc7&0X=haAw*@MyT&SE>?q&!XXT(|u#t7KT(i>PR197-u4cevy5bjyaNKDsiA z(G9i}LXUNy=Dj*?tvSS^$CO&UwlF!h$~UG}kVOq)DrFfQsem38zo0$4GUr47t?xvt zfvz=qF;ZoA0h>c_Ihw8OS%sWmd12BcQCk<`|I4L)oX{m}wkzu`H*Hvsx58Jrwne4m zWM^o1VRBJO9K7vIW$9G>-Y%TwQ@Q$j9`32rms7Y>->Tqh&l`i4_>~BF-)v2pbw5Aj zpiSE$JXa_f$@FEQI29SkOuIZcy1fu;HC7goV@F&X2<}BV`74g}^EV0Dydb9j8m!wG z&+%sRhGxZlPJIE@N^l+PEtbAYtGJBN$HxFR@MkE*vAaOBes88Ft7=!XyEPQVAP8EK@jRjVocN@FE8RFaG(7q?lpPYvAi{#?RjZ44{FA5<*yP;iB&w=pThd3HOv z(Avv#P6^LfvvBc%g{Q(aO=-2l(AB_g>$zZlF|+#Kp{&el+RXl^vI#a){>DoUbLs+(4l5sSfJDp z_2rIBIZ-#R9a@%M#lO|6K6tw(KecJAjPI3e~7z~dS%x7)^TZ=w0Al344M2|C7+1I4SHP{w&nF3N$+>gf#SR~ysX{p z)NCc_3+>?0p|}sY4af2~@$R=2UlrkCh9;Qb?hTOFR%VyRb7p+5+pK+wwJuRzQ5c;dGf*t7y_ zXfN!m9<8+vNZy3ty_)G)HS~2(V61sZE)atRhEI%9*W6IcGf|3XDK8zqNuIV{G0>P; znKDKgO+IB{Svh{7onAb}AyMXC@Enz~_i}1~oc!$S{0gEZ0^ONhDY3Oc7>#VHvBSa5 zoA$&SEubscmIiEaeU)2{)G%5l#oipn@YlHK68eLAMrH0_5(QG+Tn5sLvAs*&V~jIK z<@h@XM=)wzC@!j{a&1Zax#aiq&h5GQwY&m5DCffg_`pnXK5lz=g+Lmy^w&hdS+&n5 zb_VWk`TMES_kzj|HsNCZq($sMGt+;Ww*I$h41L;y;=PN8@sQR(ph&gjwLR&haO~6O zexP_n839I-Qntor*BbM_cKCSp@b@C%I1~q|{DUil3w25na))c&UCP5#J)=cIDU7~kCbPY(n$$&Z z-K^(XTDw(x;Sl8>1-+=coH;n9lGMZRI~{_ObDG`UTtNG^ECs!WncM~k(BUj2*R{cq z4`=000T3RzsQdExdi8lZ%5$U(-N03HEpb%7V~h{@Z}FO;JT>a`5r$%4iyR|4gDA) zK)N96MrFt&zgE(*K@cd&qBBwQqNFcS69bl(p8@IaJnS?C?i_(YUC*8IBj(#HW5Asv z;OnK=U`WJyKGIqobhX2=Q!?O>$7ILjXt3*|Zz8oM{uuPk|8d6q{|r9<|KW#bMTo6W adq)!2)tXP{m?i<*04d3Nwy zndL5w3n(r)jVL84H7*#qq==-53J8b@9ITo5ojL#OoZt07=Xd`9_d4fz;ib>>@I2r9 zdq4O0UOxBzz0)pux^4Jk#}`^!S{u%uIeAe_>oXlKtxt6SzFu=?-O->QG=DyUUv&Fc z3*TorqdE9A@`T3;Ev-7tm#ddQ(;R<(<4h1-OG~G7?e`NJwBkE0Ew}H^o;=~35GEL1 zzyKu(8BD!B`5w#k{>nG^#oy%q^ki2_<3p#rn{V#kZTNI#kB7yB(-o&STjcyiE@dC? 
zI z&;@LMgE;wYo^*9tkX^-XBRS^U!YaOj{r$tW+khLBuMxGuecdrLR4=1(E?e$WPPPrRW zgxv<58SRfJw`9i#?$J5+g{e+}zkkoO^Lahh1WIrif}11C#;tz0%|#`|eE!2XfB)y_ ziTFnHIr2en&BNABvLC2B|4jFZjXozPfBF*#G(%N>13U8`5SmqP^6^rjd(UMDoF6?7;&;}C*r>S$n@(^v+Y0qaKmvBm3legW#=<5LzPCX zjl5hwKIi904?B0khEM$T$Mc^qq>uXksMj$>OEq$KWnWYhik4G#2 zpk8)6F}d;M5$fCjk>RcdD4aYt(hbF@|8@8ARu=F24}sH>?@kB=dr**Hap_*yji#TUUk>;Qxn0OSIilG^NAobA+f%xbN zbNDL*&1|j+^-{g^TwCLFLiyd@`nP{)L!tMd=n8D3PMqhab-MN3A8c-zLoeX#`RxjN zu|C`6pclwEAb-$uv4TBn_xBw6xq7Z;f_iDD=XdT$pZ?SEe=t)Jg)9I3 zN`Cm*+N)^SynhCm`%0-zr|1IxPNNlzRrRtvY`Ux3QJM5gLY?5oPBWxBqy-CY^~^IN zsgBwA&Zq@anuthVeJua z8o}^@iF$Oj^B*HBg8RO|mFuT2JS<=E|LsOgG(vvzZAX%Z5}9tUmF8v#`0PoCSiVh! z%=ay9a?#53p}t*5`J5sf-w`{;U7|S!A()5y zp=YVpJrn|PMt0zXw6mW{5s7A;Ngv(3V*oM%d4qwikKAao`l6P*EaOMqiZ3!twb6F# z%Nzo^1Z1H^L$S-Rp83xAgmd%5xkFyYmR^*L<1Tp#^82wjZmeo|#Me>K-(UH_dZmY9 zn57po>egFg652s1O6aembG-%ZxKPCVUlaO|dm2)fa=i5xS~?Q(?#Wh*;9LtTb|>9Yv=&-JfU6rO!S(VQg7m59awNvE?TnF z|1g>K63t}5SDqEMs=Mn0UjFK3e2`{xETj(*S8u59xb-l#Bg72Ng-z5Q>!~$y@bB}J z_PSi0T;yhR-wh_wpA$frcL{zY$IfA(=C}S4Z<(yT^fpQIJXXZ@ji*n3>mZM8%0=mB zU?OtAf!QBfGj~w^)UCz`nI3z%4}>bPYMiBdWRK+KyGATrQjfLbJkNGMr$9%8;_Z|c zRIyu+V!dR3HBZ>0pUMOr&H2iFq3A#pcpT*9Ja)*_Fii2IXfol;HJQ02)T~hGshXng zNZZkdCd1X%F9((EO}wo4A(^QPjiRf1bpG2IecHE^H?FQNP0BtUD|P?#dN5Pbb6?Ea z;Da?wU8?>Smv}`IlHzHYJYmLpI)URtzs_=!1+4f_JdQbEql~IU`@Mbn{92n;25QNI z7Rs$M(8z4MUeW z#mc^q6%3}V1b{+xMo?A21h8);x66mSXrf?ZWw~HzQgabHRMD;6(1htcTbmw9w`P*B zg5+~4Jo-J_p-pDoA8ERIjO3Hy+aDS&p$3hPUbt;kNNT>p9=MRvyFCY{&`ymi3=~d}4l*#!4e86~sa`IaSlwD38+$I&HpQpcW+8TKOu>>3EKrT{ zd@_?*>ex!3pFoGz6P%7p%=&hFjn^totU${{`LSg?v5GpMWX{xM75w3W{xnyFAjGXl zHU5ThQCF&zQexlptsWd-X+7CiVdi;Zk%5gvz$nX077dYo*zFJ7GpHfDuFU|rJ#dm&}Y<%*`hJOEEk(cttLfg zG&K-1XXe&X4I{6PiJp5ITSTI7)U_=RUyC7_Fcc>%7y(%G)SKoqCo-#C|9kCum!Q7t z%ziie_s>`7+67b8)vquz7j3#DA%(ncpElFMtCTz*60=HoTd%Ns2nRXZyL%2XI5{7U zUSgsBszMpj4mC7JEIb~=6$N?p)b2A-D`#TQs4IK4b#hG)zU<_wFb@(H^wPS{(*|CK zEHI;Je`?G&CG>TYdX_BoUt6iaJgG!sDWXTtF#G#OEAwyaZ!sj?;RZ0b4?VL|&zmjH 
z0l65X>95mTPfUUQhQ?Re4ZH^K)GS&rwz6oY|HO9F2G*{<($f$cY0T1URgjqi(Wak8 z@n6%#$L+B>-akMVlm)3I>O=xiVPa`jBeQ5Zim#NMaNnbo3U6sLOpJQC$FMtuN3V0) zwscFD-ANs-Ko-fqK5iK#?6p$tG z>((AYd8FL<-L!t{ZmY~JsI27M*$>Xk79W9lDTnqYcUk2e!4R0zpUtXH9NpLUa8Hl5 z=XweIi(xUDj1hqoW8)glez4>Wl>+*mwc9{`lVK)y_fF7Ibr)HyFV7pEPIAC9X z8WJ+O2z01|TG{UTmDNoQo4oPxp#cWg=3*LZnpIkluEp*))iLOHQTfeIH0`EwMr0jJ zCew3fx@c(aNFwig;yhd(V(@WNkjd9`o~9S&AdXR7i;f+X{z&Zf-nOZT;x9` zo@6hwVTzu*NXrfA&*Clod)Q`?^pa}XL;eCFC@dkU;~w&IH`H*<+fx-8wQzA+I%@Ba z{S7@Gi*i^RejiX0kbKC2;>}L&$u= zqR$%+b<~vyLV^6fI&2E;JxELuQ7^OYbZCOaqLuE6<$!W8C9gs=^)%{bR8h$GJvvD_ z_J#@aHzMJ<@f=%;tyj2s+QDO^X1-TgfyXC^Q319JF)xTi(-&pL9JuOZ$y zFmpy*&w%7))vU6d5t&Lw8y_MFBSR*{aszt$a1+GeRnw{x52B}oVEZ6%tGDz zxg*ye=n#-K3mqMey}4?qTxt>-Te_(c7p5m)UY7GExGUEb%L|Jhdvq28&?!c%7wA>+UHJ(?&neBn+*fh^Kiq`sX+<%wF_u}URZKTk)g`sIZ>GTAV762 zX#%=D3Xfm9wbD&%kQIc`-;}S1)M&1+<1Q~k?MZue#x3)VJHj8mYZY3Vs=Fr=Il+Z_ z+1wu^8M*MSKD#cQ{q=bfUYY5t7IdMw!^G=jW+(az_Cf7M7$}|p}ig7x=qZ}x! zw0d|X=T{=NWEt()_p*~d?&2NOad~0?_x5+I0Y4KA+tls|0lNFMJGKw(+kjY#`qEq3 z_ix5+Mu_}1w>E|Q<#?6x_799dq47JRL2Yq$MX1HB*h|YU69?tu=k*KOw;pDndpf9> zdT+UdKmRra-_Fg7T}@hHV$F4VQ$uL%!COj#cfE*ZmHFVD*|@IS zr*1{f>Eyl~{50q;lc9O1203g+QVPJ!#mO3AO7O%w0Cm1XdjzW+7G4t7Z<#bP(<_&Q zo{iVZP3_=Afh1F%&09SQ%7vFn=!N5&Wevt3!M|x6aV#K!=l7S#PF{$xECwlXRjKgA z3MWxJ5&A=n6U=@apK~k6fX2J>=0M4P#Lc|2^1v z9Qq#x8-Kq4e_0^*M?BeBjl%A`8}ZQv`S*)PwO;>!8vOp-4fjNr)~R+30-)G058H^u zvK(g|I=StzK+ae=AS*T@X#Bh`VxKcgvD#SFCg|r*p)&)_l2?Yy=~x#0@3|O!zjM;i zm40TIm`iaMFh|XoY7{dx(sYku7*9CdZ5<6_OeW16WbF9T3RIevD+?%A)V5>|y>BNT z&a!Bpuj~AZ@YaLUu{hTF^;#UPRCiximL#`Rzgb%tiB~S`r?Su>GC}129Nw*0z>aeq zI;%bxB|6cj9#Xq)Vq>Y74=pe^Y+;FU>ZNza$M`n(inz?;6b(A?MoqmexIp#vIuC{GSWNY+2K z!z@Z3_Jc5v*zFt1W>c9mUB; zD6u{AisVQ;G}w?Gm=JHgHy;ZUfD_y=N9(`5IS9v*J)#?CK0Tj2E*jjXpIQor)`rzI zBr3BAsk{I}|I6euDl;>eqYy48rZeHqo~AuE8J{U_n~hKN>T@?1A-!4`kX&cH0FG+2 zv`g&0QnH36#h;u!)SjOaV_wR&*$^QzI0wYo)qkJB(YW?TpUXI`_U1&ir))7zQl@Cd zeln@OJ_o$2Wu1TgCBs+}xoPfNC%?<&oc4V}Q(9yn>~G&m9D(P5at{kBjrCyQ^qI*m 
zcqab&8|~TQJ5od_9h}x`pH*N_s=EQ)!C--}9+jpc7KJ?C!qNEXrWwX5m@1VAYojD) zc8To{$&I(N3@8|UD5$QsDh=-9fNzG19F1ye;AY1yOT$x$q6C7$T^`!QQ6@y4W?n=t z>uf(8tKhh_t{rQx3-p>GJ|KH(kmFd)S58Q(U{RTx~>UPqx7Zc7jK#{#@81O=koudc|VFJ>C(kdsM+8}i6dgvGc>PSK+5>6-6oOoN;WJ+#I%(a zjKc@Ql0hNuWw!s6Cilr3tEWrQo5Y1(iyI;(r=J?UV{M3>ynN8K7oyHh zT3YY@T_2o~%@t_@N6WO;H*}dOG%>SGkyLf$V1VLyi?qb?qKPqMAQd==+`ycfm8R*I zW;33Z#(}`Xm2*s1A~WtiTCii_h-iu+A%<29m@!5Utwi;+Vw(e>wWd$xfg7 z=J@JY#6-`2#Z`Pm)TE=y3w{tcu2g+iHWi*duEkJZcMj_utb1JkTM})U73ZMD+WscO zX5+`-B)v{Rl`_q@e6gkrczbkc(0%$=qBpMj9iQD8e^)lCy>s+Zn^TRQrEG?w8*{NF zv)Sv->NlD!3hC7HB?lPDc$@%cf46)&woOTSu5?cRIl868D;m1O@IGjAGqrJ4sq6Ce zE7rD}@UzQBgzk5)GmD+qElwOc_sh)N0WK439io@Iu9r!~CAlfPB1efR&fl2}-;rk| zXZty6CiRk)`~$H4*C}P$TEhSb>m%o=HCW6I?7qI4<)TbH#+SzII-Pwe>PQDpN0CpN zPi3|$3M={#ZCUc^w<#4=^WsI$y$eCrvKyf$mBlX=$@HZD#6Ko$x_1p(zTFjY_Lon# zRqoMe>$9|SH~$m}Ld*nneb6_U)uCi%OL4+RDVZC@ykUs;+F4HA;$+H7YzixZvI5)4xwXT-rZg_IO9H+LyeFw} zFB-RBFR6st_J-;c@AC(`4HeQrW}#S8QS5{Xc8Qzyjq4LR45$*?uQ}z?^l3Q7NDa#= zjfH!ro5>6pfuB|OBIn*FDk2BtGlQ_VQVBQ~?5sfYEG}A;Z=9R;J>?@Q}GL173o=Ps8%g z0Vxptw8cK1ag=N{cA;H{PZlq_nyg%qzjt#9Hb$=sMSPGt5g49)HJ_G%JtfA`yZGfp zcZQU~zDe^|l(N0^>8O}8PXm*t*{eiO`ar6JF7ab38jEatvy-FRD@SW=PdB|es?7rM zSIrADf#t9)W!QD$>VA<^wFlk@-`{(W7wBEcJ~ai`V?;XTt~wTM%{&0|8V0b7?S22o zQ2$beYZ{3?kC>~Y-Qlx;q)v3NIU#2Z!-6J%mi{ahK+#Fl26;U*m(6mS8%$kZoMJ7c z=B&LV*6Za0)`Q_YvsG@-C$*nrK<5Xw!=_>&s>htSjse zEF_p`-P%nHD-HKi+M5)5^FvrG81dt~FZb8-3m%Nz$%TUuo`uaLHH6L}F2eU%u#9{^ z@HA&BZVLh_w~u0*pqAcDBw|e<{hjEd(!er7(b2c=#pWjPBKVKf-Mo`7&bJUTe&zAC zy{_(tLlo5VTmvK5Ag1~9@@=PusmoZkI~$cL-|<^wO*X?v0HlJS&9t8rC?r_+hx>~uD{vW76W42#6TEvgJl#!l`IwT+j3!F z+VZg9{IPL)->i^F9%v(mny`4{U*dY=TD%=S8D3c0sVy`M0F**n@?lDy2D-NpOrvp+ zHZH|xHA8o6Lfj?y#?f)dJ+Nl$kL$kB%a9}}{BhRC`Qgk2P4{?En?o0um9TXDLel#* zQLpE~9oEG{Sg769a4O4hbhvzK#Rfv##*KxA3ZPTZEt)oWSvU2v6XAd_i4#(e(;&+S zba32uU%EtGH`FJ-dbF6`+h{^#$Ip3O&Cd=x8oCui`3kUf@$pgH)BTJi!s+gZreYy_5En@~sd;dzsj##V`Er7Y%!K zB<~W%y>_}L57FYn@eJ8b!c0K}ONE5=###>CVT69ot}0fr97f%X3lc_8$(P;v!W~~X 
z-RC{thv`*Tc>1*394wc=5+IQG>`k9Yph%8EEiduD{88ai0fYB!#<8(X$tpu|9l4H> zYzz^Syya}$Jc=TYv#~;WiXBN_77mYg&$I6a>7^$6iwM?yW`Um@!&jq7c{gxeNL`p} z^ZO$En{R_dW#m_DTJG3hScM&LIVZ{4ehPiUXZ!dnX9JC%`1co}IELbD3c0Fw_0auF z3d_eCty!!r< zRz*0xGQ+vcjcwi0%Fg6@G#c_$g8`u8io!0^<{jlUeRfO%+@iXudsaZ?cMsg@6bZW= zYKzOwv)m$_p-)y|VHSNQir*OaVMGo^ysg%jG7?|AP!{j3Q`}T0Y|hZTK2M_+$>cj; zlbnnP)8Knyb3`y*Y(9Hlz_Z>5;}9xY=#+q?r%UC|LCS9Hu5jk+dv?KCuB_UlX+f?p2q3c*%`2k1 zmLkQXMq@=9RwqZPkVa&vPf%)Q2E9A{C5=mR#x8$yy|pIKepO=h|&nXv|8giu_8$&Wo1&H@H} zCK)({+|aQqqSrF~@=4Y0%R4q^25@SW>+bekZ)FK~8!enB_z21U!2QVNuMw6C8%{+T z+}BSHX5!#!pF1mO>t*23mnWR92K6$=%DImg`)vh2M&3!DhST*Bl%<)*Ja>7XF`S*A zw2N#1o5o3COdP z%m}-945BkF9;KIotb}>->_U2hZ zkjf5wx2o#H!F_PN_Pb$RvTdlHKYJf6>rI-mL8oFP~*}B>598-PTRncMbqF- zWSKy2*cF~3u*r0TP_1 zP0MUcZJ=cjDyoPA7|0!RmI7@rZpYaC)%cs~oH0KdKb}45l%DkxE|4S?ImXx0$l`iN18*OLKt<#PYH~JA_lJbtL=&2k zO(;A#fGtdX!RKcZW*mN_MU#!9alx^N+(w!8}2oIzZ5<79q!sV4TQP6fA@5B)`K(bwu31x1C@;g6%wuC(DE$n(d#iVhHL6D;-R6IMU+_5}cVgB^l&_!)qlIbs> zyzY3gQgFC80y((lQ}cFr6%>I8la^`vc)d-sCom zi)csNCL{KXIFxynKJRFjM;n$m2C1ebM|F_6;c2vTr4ZR_!pH->B6~plJI`Pv!7avn zq-Iqnyl$jnR422dNgkO=E5~8aXSz)$M(>WfrTC&9(1<4;y$aFKv$kO3Lm4T%vh1?F zeL;K}kX4S`kwWT140|RH53IX3u9u1h_hw(~n~b|k1n=4x??>bHFkJxzqPRAe6+_1L z4qVUX2m2Vz+I56!oU~&U)zjLVuESp#6^>wRt^?2o-r~zUp6B> zOOZy9r_n$RCy%vK!%62OJO}d#o1+Q83x>74Jo52!?Qz1f%5AP^kH2Zi&h-ODgdejE zue17*;tds=k^KC5B3HrpXgcN&QVGEDO zZNv+y%#t2%xKMZ<*OL?mR=%GAp(-*IV(lMxrh^nJE*H#(s4ss{k{f~@q&S83px;|$1}p3Uo<>wB=-MJBLMoT2udju zf0RI>?7^ldWClVCr{b@Wdk2CrJ(b8@3?m9YX&YN?uboRH-A{IxaI~R6&S%e>4vFjW z=Og`yoR4ECs>d0VTH%arP zpI-~o-IUB>G9I;6N@`WUmP{hDRM_!dzEu9ai*IQ)>m}JUR&g|JbS^e%0ot;Cy0Cc4 zAS`UF1)m*{tsYLUZKEC*HjAwBO;(tIn}$&#sOoXn2sl(Mkn3BRO+F&lkbsubN=TT0 z(BPY<#sM6ir;_wDJ&M+O8)4W3xM^GCri$@Ef0vvAVM`CsQBM&NWKzD8#SDI`ob}oy z1Gh2|U-M2yL8?7ttiX;^jT?KUuXQM{=wLP1i2*|f$G4Xuz}5QKLq_l7u_jNq-Kf)?HeGXWBw?wtpWQ}b>Ep8en!2yR*RG@j@Ip@~>LGJ_5S77FKE7jlyp z+HWmyL~3GcQJNR>$=0+^`_zJ_1j-X6-q(s^+7*u+w}tc@wnjgV6)Jrp6}7zs`KS8H zeYCAiF}SpHA+S$b0YnO-S_!TDhcbOZs42vaV>?p1XQV`r&kZnVUBCiqW>zONFUtm0 
zE-$El$xG)yoMMq#`JFaz{vHio1syZqXBZ`C@fTcuSJpGSW5~O$wfAL zv@#UNMYX&b*=9RX)>GvfV@hZp4eHj6Uo;(Ghx%QTrl=+RC(5D9nfx=Kw=K)oAAtnC zUESbn!C0}=*=&ty4#Uy!YOwk`P0e9)HbWh}e`ULiA-}w*qHQ)U#@s{KCfGk{^tP{+ zsr;nf7qZ&Bb~>{1#b!~ao1ii@FFjm8FJOShusbO60h?j^X%E;N2h()j5u?5gR0Y(6 zA8C%GO~E(uc3Pfi1DeX?8%G6+$+fn!2Q*GNgSL&{g~$*;tEcz_!xW97aiX}=Se7xS zoa7NPm`l^^3eVyl!uYUJpbmTL8An{Q6y4ZRV_m+r=k!i*Aj^1(0XT3h!v{D1oq=uIWB&A@Eb)=1>^3DWKKI zBYP#s2$VM)@IIHo?|00dKkUhwvgywZ(*Hq|D}Q40Ap~=VRcKqNy0k;tpQpJxFD`?E zwmyH7sjY?gRSo)F+5{`Uvgy6ti}8Ir_9qU47;S)dam&PJu7todwAjhGC=p)ypx-I^5y-J-0BvQ6y#xM5ro>9! zpLT>pDE0PZnTAQ3SsoG&GWv8a^g7?AFJ%{CI4YF+aLTr^NYMjlQMp%9MtuhvrK~qC zj7;j`9p!|E7aTgxo2`0ZiqT<;SFaW7rNFYTVvmZtIo)%0A!L8j-1wM+vG3>rYK( zY47ksps(Gf6NT?f+N3v(yKLPd6&A$pOi9UpX-T}Fo%8@X*@JC;nhXR6C=BGc@3Zes zK4MQp+8uB|vTi6~*eF4zRSwi3yK6-r2<9P<6!ZzMvM?OTG3maBS1-*A@ad%RdvUCu zE)+5RVVOfOFlLB3Q4_0PsYLiCgaEQwd{04zy8wB#*A=*>w{nPMtBLL2Fj(X$eFa4i zW++@uCo#kNY{&K!#r=a%cl!db$c`z0<=46TH|`++mRav@Gp^&Q}VV)tVMGA zl@KBvbgaCv(np_gjJTO#m8NJQIsD{g2`)$tq}b-3VGnu&{Fy#e6rw5nr&f3tmV3f52U${)~r?^Nn9M*;H}m(hB_VOr20Z+nZMB!1N~6y#87h{>WlHO z2mO=Z_xg@^`17D>a#+9i)-+uf8P{xONei#nz}}12R3X*Ye7u>HSmZXyY*_=rl*3Jm zd$eb`g7CqB?aCz>HI^KjIV$KD?+ea?t7oeTo&&vC*k5F7^1BebUUIIVXShVliVKhG zyewayEE?+b*+MI=$J%)-?Fn0WuTe9Rh6EOD9FgH~0E%fHy5-YqoZ1)Cp8!0ac2vd* z?^7T*TnjPA%u>k@?4>Ls7qLb1ObR#9!+6tIj@SH8fv(Fb86>VUs$Y<#I;rr(O96Na zvc@vc#r(gGxt39T{e!Jw%he!dRI{qoM0w4gdGLrAWxdh;Ar$r%j|JaR86C45D|6?f z!X!teg9D7Kvjfx?Vd7ziTWzmoB@L1k6fu|ziUh2GlME{769w=Gv>H)WfI@p?PQP^6 z^mc7Ivn;GM*?CqHov6u~Z|s;L{5$namkt)7Y2v~?EMrzGP z6Q?fyp5@@Lz%8vy%0Z#hy8RDKfSk-T^p@w%HOW;TFBOgRpx!j)y&+ETxrkWNMJG!L#;$Y!yar3e8H0d$ zVjhM)IUH`Vg=bBmaYRnn&~s*agv_1BdC7@mz5!fkfS}rW&v*YVeDvhEb6zfmc!O$4 zw!Kvyp}9Sc#(^cidJ;d`LX~!*r~+766hD_89OyIFSbazw#3?a3!Vlln~iV8x5Sv`rpMwmDpQyEEDqzGm+9Lb zp4o^Ou%M1%#L^3eWq?MnxTEV$Ftb$*;FvEh+@HK$ARf7gaUBVBSCK(7eR`7NB;rCd z?hFKsAH-e{62Ak}1?Qqe(!N#RD`PDtTsg)*p_>m$bJal4g*?~mZ8}F~Fje8S*-#qN znCQ|YC}3q0N)pDN4rv77Xj&YYHVb32qWNx+DvJer_}2oTe0!_lAtEqoG{iVEIs}ra 
z;>0U`*(;R-Es`5@!@Y4=cS6VV%211p(8M6{+6>u@NyRDS5Tve8QXNzvb`GF>Ftmo> z{;nJL2W?fu9~M3CZ@8b0dVEEs@Z5dykwlmes`E7_-ui1*{ zz2ppgpBjwb4}U-X0BS2-ulGCNd8g&TV9Y;MWoPF5fmUPtj|>JJd!R@YbaPpOU{u1K zzdjeK$yeu$2JON5^(W$y6pAiFOH(`be4rD2ZnoH57= zn%^@7xM^#Y)SoaJ5AQ&Lj6u;I*K5dG6~M7Al`ckw>@3Js3t)>T9kYYBFN^_SAo+n@ zh#_p*B~V5&yz`&iG|fN~x4jfcdoWWpXupIr@(AM!S>7SBRo4iYm-^wA!ZSb& zger&ul`vC>26MY>Ad`p7xGW+cae*rAw!SwDcFsDlSn>&fO|sWzpVci))CygqUdDDJ zL$CKuws66}Mv3|S=cqsdk_X-;YshzmTp4xR`hd4PYSs;M@M;Mq)Zb+^uls_+?eny_ zJ$SJBj_g@P62#!P;QshlQA-#)(I|Mtg%7Y|jy1S1>`mU1rv8~P`&*iz8aWp-;$?1% zFgh0Sd%Eq>YX6Z7tNxFf#l9IOdmg`kdzM~BsIN21s~gVycKl!lva`*4~k8T6^ zGF@#J4>t`7i)nc~g_?4Bhd8XI8mU-H*2hc=qWPA&tg|@Kg!-o zrc`9nuYHAWa`GCRP9C|_GX*^oaWLCuV+h1EitWy}S+72t>Bi~dEGAeOLm$#ldC)A0 zLaO)*i||4b+W|x3v$>8$Rn(RCvIt?^6X|5!M$KE5tu_}8*4B|Glo%sEA?#e!i5R}kSmuEHx820X3flV zHOVqbMN!XrK>|lrW2+6}f}Mtt7Uuz$K5`_b$UT$xE+SM-xdr(yMk~4~tU)f>jyBlU z;YO9+50W9sQsX^4Qb0VS>{HC%p@8DyrlXSn0jqfIA5e5YPqMt>xas=BYdHG*^6L(0 z&IQP9VM^lAMYO8Bgd*Ig2qZ8fPHL3(`=rSWH~SZRP5XOK>Xx!hpvJ<>O@gbX5zce7 z<+83Rk6NgtJeG{_qS#Op8ny_;>qyMdumK#jXk|ua6Xra?s~-a;z0VuIQ)$urJuOEA zQ>>7Rc?U@#GpjmxCpO=R$7h>tWx-6O#wyb9*mS)qE%8j#4Xo5R(z;R);!#} z3voY~U6swI7i6{D8Y?rz_&D^s({u!uqd}JvJ;X!)roWMzLm}iaO}@*kF*JhXAe0Vc znQ(_Uu}GsR=dh$YhP}oJ!Pjsi3d!Q45~(}~@UP4SwS_7ZK#-6v7`F%~ZGua4Bp_GMrt%3}fjr}bWB|MlM0rCJwf{R7``RgYaN z6C5n;xLm;vQg@f^+NaCp!?%AZu>5xNwcTEIbx>R7`^eQBHMoPNA`iQ~W_(1>VCDgJ zdeO`@kFq(~ZVhd`S(;&Yfmj|NYMJE$4D$xN;1vmA2p6^z7;Esvv0$AEILvMkZZaD} zU)fIQS<8V`#9jJtB!-~U2y&|x3##sIpAs$)YgSP#%$S3$tRZa9V*(%n#mLc$A`*>L z3GoRca_dfN@Ir@IoPl^yfN+`UqOoya6XI+dmjlNI6>Uw2R9lz^hNH>O7EFyucsV7EpkBCxAt34J$(s>@=NVZav~=n39V!j_q)?#BGH#czf1TTl9P>58%AzN47|K ztuVRDbFSQ0#=UOWYq+_3#GjYEX#LJ05C@!o6;m`rS*194V|FANclrpZr z8dhm5O5iB2u@#^&TA$zldla#>&`1B zGE+8Tw=w)$q11xqdfn3=k_))IvMz=>u+X-${&vBmB|1y`iTDus+Z#tTh2`tk%16sh zbtK;(w{v)W{@6|plIV)okJzMvC$-+4$O--UyTo&%eB_}LO2rq=m<#Mf8845 zZd#|^SVFjy@*}GF%aw~d=^9a*picDkoFm`VY{Qid2GGq2Z5NX@jZePf&|Fm3f9C9o 
zo5p=!2LhfI<-`VPnqa^J|7e1Nb{F;9jhp9LfsC(0BsmNw2wnV_J{!}$(y)=SigJ0G2mn2Gy194QKtX+>+qY2hW-KnUL8yhj9IEp`pPRaYqzuxt-X-5 Nr#w&MzrFOse*x4|CE)-7 literal 0 HcmV?d00001 diff --git a/docs/img/pai_token_profile.jpg b/docs/img/pai_token_profile.jpg deleted file mode 100644 index 52d68bb7b571dc71ca51b62dfe722daa90048bcc..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 55722 zcmeFZ1ymegvoG2cJa~e;26uN09yGx%!3TE+2oebH9^Bmm3~otq5AN>nZjb->-E-bM zw(dFKyZ7An*2~PQo;BS)Jzcebd+)0H)$XV1r)2;`PD)k^0D(ZjkLNe=GzUlki16?T z@NkF-2na|>h%ZsFP*ISPQSe{AM#my1ASEFtAR;28Vx}RZV5B4>qUE7uWM$*z5sKEeUfM8%=z{0@6!NNYD?fHBhfW?G+Mb0J$ z|61h}0);&m`pV+D+$5b4K4!%e)ad7eQ38>%D(9+R!a&hzU^6^W&m6Vc} zk(GP@K}}slQ%l>(*u>P#9Bkp}K_mo6dV#08yBCD_$?_pJ0~|Uzo4+F zxT+djQ(ITx(DbJXZPgv?EK>L>iXvPFTFqj z%-_}eXU+bVUYO5%y?}*=M?k(hcx?V#r|)-W&so!&~xL#U;-k*)h%td&&xmZ z#|D2e@CO5bFz|N_81;T%!@mb#z%G6Eqy;T{--(ow{I!vg1QfQOfa=?kzir1km0|xJ z{C{PHJ?2dq+dWk^NZaJgN>Mm0m474H&UodUV>X80>vcTp!oPB`2OwTAN4c*yw{fm<*Xy580M+5qW0T|)kQI^yuEQHR#x^l4B@^1w$QNLno~GwM>@yY z+N85B=F*Pc0y_Kbu!1*3f7rw5;6xSQGYU$Xb)b!y#Ic@A(iH+zPr!ODMCY;P%@gq3 zxBxsGQMbZ^Hv{p|95?tdRzq!DxzJaiz(e}0yUljMlb!{$C;GG5@jwiJF4&BNgY5^v zm_#*dfMrmz-hFA@94VdMe8_O1w2IbDsL<3jcF$%+j%p7Zj+I!>@BgVs>1--LBaCZ7 zZuSXycf9D^Pf}F4{*WzJuhY?DfKaqI^?PK*EpXd~QzO;lk>CV3;JoOfg{N~iZDS(W z6r9;&ecV^l545#+d3QJ$eKuR=JMzMO0&I512F)0EkOKn5OTRV-fCA79a~I;{*a0Tz`>UTfZ35760#`6(b(_(_FxO&eP5Fyo`m$_*@72&jP}GA1yHGF5O!n=x;IL?R z13KDnD`$$DjxDCSOYP*)*Rsy)b$H^S`33Knd)q*E2<_jc%34smUm(q6<6KL zjG#RAP%hUslcB$9XVTa8D#zaSld*DiZJZ+E-Q_FG+JK9{fkIs==(BXewUMIEf@Cgc zb32buXwg!4;CsuL(Lg%!dz4*RdT^%jN%S@6=)fD*B-)XyWG@ib!qjX*kVM6a&rC)y=R1iPa*T8#N`FI;#S=WZ*iL zM1N|iOv2BxV!hGzySY5l56v@gG!3-8v}W%k)C}&$EJr3)b&tKspEuXGo~*Mh9kgyz zt`UzKYzg=+2K$kSq@Xa*B>UoCMZ^7;$n2|geV70e! 
zn)~Lx>?m~eq|3k1a-_+;`t4-33|?XQX{Rh@0n-i4j=q1{wX5t2cC(#Dt)*Rb?fFfT zmQpnxte3NeC-=~v(jDAHqwZ|{OQVb{pD$wqRkBxtzYq5ppMY-(iSnj)?DqoKj&K>X z8xYkw{4*V8a+%x1PL>Wr24aEb^MuBa6rHedh$=$=iezjTRK zrQL8JZ%irA%LX+h$7;%$po8Mm@}_r=7^Y&UQPT8cct5hWOI|FlCu`h)*>Brh6nTKv zz+D!(A^L3l1Q_%`yB->DShBEmSVk5U{wk@r4IPj3&0mip6g#i-kDO&8k* zms8OS^6Bc__}m}OWj%`AiPCdcmJqHB_%|Z56EwapJY*T!Q8){i3VAw!u1ZKzxyq!q zjvso*&Fas<#H{*qw3=PzF23v!@n3b$(}PMT{(mG$x`u6F?fX!K=~?!vY{>pDNax| zOZxphrY}R^zSX_f@$BK|V!OPjtwj7lbm`lhATOU+Xl&_-ya(bgHV4X=J^Mz|Tq*mdW2Dwth|V>xC4taP_Bk zT!Q3^9-U*CQMqr2yGWSQcl$OG%^DU^VmkV&P>=dz_;&f-eJ;D7Je)O1!aZ~6MVUh<)^TlJ))QK(RoOpeVoJqWTB-cd66Hd~5?eL!g@^AP)O# z_nVSJBj-nRVVvZ>RqmDobPS8*mMS_P6Z|I&( zyY51gIv=$mXtq?QTV-siOq%C%%HNCH;x^Zdv}L@E3va^6Lyal2X$n|-_K_oBL0pdH zPf8!jmC&Rsz*Es*get=XI+j84s^|ru<|Am87Q&x^d{j%8JuxR#cTdu8m)n<+Fw0xL zbCc|(e63vnA2e`ZL>3+--Tmzr33MzMEcnXqVUs1BWpoZ)aeA?`Zh?}pE5Rv8g?;9N zG)Tt&Ua=l`(a8%>F0qHfw74$>iZZ4cmlq!#&ibZ_byJHY z#iriW-eZ4KzCoDWRbAbqgkMJ`(NSwg#@y5iq^n`cUZ+ zIBoR^ru@t-!OY>;O}W?@32UAVU8I~4)GC{Oh*-UNYp3)2jKY3u%wJ_#kOa>1vbpRQ zzantx2`KVgQCT&3W#L6pWq_eKJJabRq0rn*cJmE3l4hN_m&6weWjL^GC&hUG>o&cJ zxxmZ~DPb1e%TD)GFgEs{%1Agcr2SaEqS*YlE%=a$Gs&4jqz;qQrTeO9E&~PkI?zx! 
z{x`&1Ve+%|+VHhJmnXLZ_w4Lz)V$_-83-kn^Wz)AFXQeXBntA|>{a*n3Kk_+&~F%8 z8J_^xC%{xYW#zEam1!xIdYq-4CSY_s?zP+yrLqCJ0|bw_g_QU^Oea3B`A) zSF;@ELy}Z(3>rH=K4Rmx9}pZ0RN!6cPmYpyfm}K5J-UaYS?GE-LmE;b!U)gKX?vNX{b~Hw|^S}UgSMdHE)=ue~Oea4f z4S!Q&k`&BwC6{N zi<&c?c1VHyPe2P3^a<$a!dD|HD3^5h#&S}S&_0f1+nGGD=fp*3c-sakR7FE@9F}vZ zO4)t_dhC}bu$vV=d92t566E|W-%aF!TXU*UUkQueh{Gu5gSiKDiQp`5i=c3N^k@}~ zAAJg;&5b(}m~qhDDq3rEplc;>n-F||JoVn-TKWQ6K64~nNA@*7RXZKE27b5xLXoop z(vsy83a2G6*3Y6D36#oi+G$t%<)UGUVXxIhX|KIQEEfZIzI)=;X{V|2#4e4a;#|}B z2>0I^;a%W{gsQ_lKH#=8w@kL8&hKFkQkQ2DGOEhjY#@HzMS^u;N4($dNM}1S8A`pi zfN(!>Nr!cq35pWp&Eti6tciq66hg$$B(DV=b0n}5T~FEjDRiLm_hXzo$5nKM8G}U( z8^!f#pRUzXDf^=kn-oiX(smgp5+wC?u$R`?EHuLYu6gGn4>f}T91U&nFJ;z8*7>z* z#!=rk!*ZULM@X(a>2-Pne5GeXA8VPekKTF8|4PhbIXn;^Mu{b@?yVaH5w6>+33WPn zqdtXC{PrZHs|8&f8uJ8*k29`zC)rz5$)+cnka_bj zH^V#>kOVDaOnescw8o1tjyd=p)MIsw#;PwrO-mP@|>&d#WoC?}|o&15J&P@iD3XXUIi@$3( z!+F>5R;2fS6sxzU=-l4SC)PGSyNv!39PR1!q3!ID(ynx`JV)=1nK>5f`RcH>-^N~k zEiShXyZR`{){k`+2J=h1)F>5L)z#AYBFqtWkTdLFhwijPyK^@F_{$Jl?c&Vd?y^i9DXpx5Q z_2yq1L~A=*);t04)1eKGlblASi}#Wd-D=VAzRyTPqB}{iRAL1)D%O(Q99It{G_N_F zOOS(^F7Wg|#hBxD$s_2C5wjx>3GF7t5}Hq{?6`NgzxQwhrt8!%!4wMHT9rPYr(QcKc=`&c`bz6p@6AHdLHZB*CJ!g^6Vvi>*1vEcs!M}cRSB{%aq zKn9(7mtq^1bUuDp3jt`e|Hyx*4e0;!t3UibvHODmf0pkK+2l^1G>`Kc3d<4V!)C%t zZu~x0-OzV0ay)V5FUWuEdH!9NEiDfop4v{7_{UbrZeey&Q%rn>#7}+f*5i%K2BG!Z z!@4VBD{&k{(SFhtGRHHtr1I7fI+TAz=g$UvaGJ$`L(d{|ImY3UU9)9~=C^!2g#q zpfay32DZ7v<&Z{U3m>Km!~WnN1Hl4OV~sl{aK&(INP*(x)-l{MEV!C#E$g5lc>`-Q?mt?P39&iVxZ;krA z`MY(+MPZ+#w-|Gfd0|<3*%8y|>LjGV|8YN>*Tgo9h#=wbUI#rx=r_iL#nnyoWnZ?$ z7>$?3Lb8G{pL5e++uB9W*}$aNgrrXZwkRQ|8t;W3zi}<323ZSdJK9Fwgu;-2e`~Y5 zMmO_C2jcb4v=Y)Oe^(r;&JJsjmD7Aa z2qDaMu|i1&93W-<0Bvz1e|%orLSE_Ve)7o-nO^SEM;^>5R$k)W=T`aK%p)^UnD@WNT=K_96d@}VEY9h5ltEjXODO!N@DVB~e3<^D zB<_EDzp}3WtQM5P(HRAmBWU$i4^~ficDZ}Tr}w}APRfZhAB4|9aWM&_(mw@47IAqO z>dHA5^Z$#s`wwiL*#ZrD>{=aiUO+j4+6al^aS1)UxYe)+MST$XQ>AOE6oqEn^D+vv zs&5%@?+UgxurpRtQWB~rn$^%UxGsiDTtm%`$Ys9>%baTZ
    7l`i3PZ3eG0)N`I& zU8m15DHq-)f_!l$zZPvMA9MQ88IxcVY{ITZJH##G z;F9?AgPN&bWE0Pe2eiiaj=Hwl%Ujc$9PSjM(aKuqdabpn!hU8^ypMcc0?y?T&36)^ z&w0cqBj$|KJ^q@`YJ4LOT?J0%v|N)=6gsjsVz&yhyx-~1e4{Gfe;Swn4*mEK{1()6 zo$)AbfbaxNb#_07aGBn9Om%BEjdAK>*WmBREA$KX{Rq-D`S1$KDALMf!uIuZ&TMb_ z3Anwke*#9Nsy{2at`uIBBxz_ij|z$r_%B&SCLOc;>r z>_g7y8dgu&8x)e(Kbj$gPj9A{$Cdq7skT4kAAGO#ICxxoyEJn@Oh0!(qIho>w&o(D z`++|(SoPPi65C9_vv(or(vtU^jbojCKM{~gqB%uecHRAMT$oT_bJp$DAQYY1(tyrS zRNSZYlr_`w;vhMrqgIUic(T&@O%_fHT#kgJ(AS76v4#&N#Hj~s2*(8sj>1dFs~S~D zPXLP3tXa@u{=}5*Bxk~yt#MdC@H#jrNrUkF4>T!hST&auwTF#^0c{-L+whvw0Q_?8 zolmN3NAxeyR{71oq+g`0^ayf!uOfStai~vR=!9x6#At0Lm}jG~N^Ro7Ze{YV&1N5W z2QdmVfO!HhY7-`Jk(0tcWx3qRaU`g}gJ}||{4$KyR?*>!?2T?Cx!y`$kJ*cxR3h`v zp&VavqK9%^uvk&uUYoSwkLtoN- zy-uhp6%rN(60!TltQE#{cMAAN{9WRbp3l_f*ChSo7?!NFHK+3moA{htac7i9n~08G z0#g+(6|Bcf5zSoL*Va~C#42Qs5vvX(-{#lY*@!{JsLVOu=)?CzLq|H1vg1xE5~f83 z`j49oan>b7-}=bk+2J|y?Xrc)e12$g^a08ufQT6Ea9+O~wqS!8jre9T~ zt1dhNvx9WCIW*4|OH=)~GhbC3^)sf1Zx>$ALb}z*&pkZfzdLT8;vU9uLu&fK*F8W9 zYjV=lI~T({sM&h*Cdp%EkVRb{tF^YN$(MI66)<%1YziW~?{mD4s0m-mAlxBP^5ZPA zJQ60{-v2FaL2l~yXb{)^C17D6GR#|C1+LRJ^Ks&>Fk;i~Fa0Udp>$hyb~twdOon@0 z>gaiSPh~Nzw8h>>q-dTPH`)eyS6V}hY|GcHaWSYI{F&$a4*2ktfnGz;N_87&PYVG7 z7RvS*1y4bb&94jD-8PUuXxgygN0N96a}k_S40cX2zbC*t-1IvCa#94h+jdo;sYwsD z3{-cT8^fEIVn0uu^Rox<2Mx9mZrT%|nts%`6vM;0^h)1?)k1wuhINk{?rj3QF7xC( zd#((KP-d9-VDx({(bDb}fndK|0nH4`72_0>YfI=&AYHYsVh-QIXv#8P>FSAXa6p;~ zby|}A`~kF*4tFtLbpja$08nLoQb_D<%>Pz7zt>6>)2`_H z!SdVTs1ceXC|VSc=mK`Np&!{Y=XmK?a(oRm(O@P~-kA|Itf>osqBdgPW=N<+sY9Q3 zOi*RzuI#mV^wH5#Y%eAivtX1G6FHd`Oa+(2!dz=dS^VIFW5*LPJ=>a8Q1-RX!2+2j zfxPPpb9k4I`e$)R&ahTyWGdaW1}U7Mf4rvP`uVha^FVZr0n zw}+<(<%uG1#pKJMfL`h2f(OUvocM1X|HhlgtVaH#<$z6ufK{pl8a(4@2?lgK%b-}X zk?xLbrhr(Zbz<`Q&P%6Rok5jUBm8dn!@aoJN4b`^hcCv@Q&Yd}XZfug_SXK##_Y{3 z?uz4NDZ-N*lA*NEWT5Z`= zZomMpG(x;wq`__LR>zSKo`X+7wsttfqnuKZ_ulB(6JVCtwsZ?Kc4&|Uml1C3OfHlc z8EEEdIMKH-D{ogAz?bk~06UNdfM$fz%KL=m_CYv(FJT7SA* zchcLxZQbc4@xfE)3E0lkw9cE~6UI52@J~*tk5!M$_Vew0Yh8EJB|_jSta0BqKv{A# 
zclHEO+TWWM>gc>F@Lp*O)x+=G+$hKBC#VDjI{0AH-Kg9(bP>rg9MI)XYrK&xTzrQ- z9Xm$n z&8>{04`0EZA<-is;8Sgp+!Sq|*i@TYI1?Vd)D_DsTs#fUb<0sY;uQ7z$VO3}q0}_) zg#o5BLC>&qq(IWfGC9Q>=-=1g?jgks)T>xuotw~w`?mX`)8RL{XzyK=LvEvGlypm$!!0P57N zDt98d9q)J8_vH&pp<++@t^^lt;mBg0pVSlOdHiov*oj?VcpaiCO0jhd?FEczNQ^r5 zKwDh(rp{Xw$MM!{bb2RrV0{dc+v#?>r^E`UXLfEi{Fq7&(*@Kf49!QEG09*BKL0pH z7uv~%UWnVeu0LZ&Uj*AgtW0Zqj{qzAGtYogO+79hElb$GU7J&skjzziS{!@)$CuH* zh|5#dYZ=$q$K4MBnU4y3&P-Dm8;?frB6|ZXpCvQ5ParwJM!IvSa~&4mQK{965+Q!` zk>cB|&2EK`+$-2-isg%e;nvXwImBNu^yI>;W}>+)Z*ggKES+*gq#s*1pL-~iDr9zH zN;XilPSwUKc=AMaw}&^{8;j}YrR?0Zr^M24XK|#sRPqD_u*jFMb?jxgTW?}RdEO7U zy&zAeUn5H3JLFe?leVUyp);~F?i7jA>kx*X(t2V9_=AoHa2q+#x1J`e*g1J7y~JUO$1=Q;7B;nubb*We**ljkPZrxcAzSfeiio3c=mrMdV^Uv6k`#HA?IpN2M)M?MYAt??;z*5* zLu;}Vc^RoNnqL{(DGV2~D*Zeb41SGm)k=_@oe>_s-)ZXPy?Ahc0`#Jwu}TFy4_>AR zZJ}J&Qd})#f^0%v3F}t%F*_77Z-gPu;2I zoCO|vt+W$?>PW-E=8HU|OwfKI{=4DoiI3rAo`Q32NzJJB$_N8!yp&-d_$MA}{Z6>Y zK8&w(UvlgySj1u(wwlLrkA3;ssalm>JNq2OE+Q>5-|gI_oGi^{5DXSPkLw+E@&%=v zXW<+2(bf5Oh{FcIs;4@W6LQaFGa+sl$h2uWcOMeLFZ%lIK7SWuFUZ2Q5T-IHr!5C09`;z?SunNnq2 zWl+`?KdaxCHD1DyT@|GkAV%ED^kKAZ9n<6q&_8)Ti=u2oRtx%0Z}ijufk^bUqsfqO3ZPz~ifW`CVwa1eAH>RShJ!Y3k_%nr#vJd{&ce<|Y(Noq$f>6br zK5-dmq^nTuow*E)9okG5L&UP2+Dds%3`YAd%5dqA@IYxKJa|f5=_}_R;dXP$H|5~h zc2>k47qAf*Et0rP1C0Mj5cI$0H>&J`s-^6x(wFAm4Y$caLP+TT?M0j<|JTq`|Jv>N^Zk!q|6t(%00#O}IvNFW-z>_nzhbbq zV!(dCt%-s@RE{b>zQhz2=b4prreN#n+ zN$wdbp}dVrY%h-3SC5GK!@Y>4Ow>Mx*b(P65qW4cZsH4W%b6hMya;x_AFykS<<}9r z$V&)%^Ez@>9osLf&P7-=QEwUW1!Z%78P&$z;E2A+h&wh(D-RbfRy6~~)F^ZbLcI+0Y z%v^yueGtP)59uNg%+jzo#MuzMj1YpRh3)q28U7%Eh@uWWN~#(F5*@XIXI zWLZ8l*n5L1nfF{x=c+m}JKolwDB-yVUnyak z@R-b3{eh#)>5+{EXitG4H~;wT;CXnVbbhA`rXP!z@ zqZG|ciVy&hviEextTq!ELh~YId=pd)1kbyPGCqkCV5t!jtG260h?zN1&9#4tH@Sb# z%T02n7tazdXG)7EjVT&xl=L9)~tXp%*Uptv^p^Pr=)*;IUQ4;qwhsn)um~I`(-n&c)xLn;Gz0d^N-OUh7L4*TG8|nX6P*2a0 z)H-a*dFSe8J{-2ji(h0>p7%^{H4WeXRa=0RiRLiep-3fL&Qu|v$xUsmiG8$sf z6OiCsl(U!r=w@YGh8j?SIDc72X4m7+&ketE)s5h%`>@Vne|*kz zEcL-xcAcxXUi`}VYS2T{=%tIZ?WL^?S 
z6QbMqdc)2()*ZWM)ry5Y^Fv!|%s`0%HKJ;m^G9dyiqDfc zVANLqQ^yURB0lQ2X3s!Xa~Q>tpT}RvhBHg%hp9$`_2hdxgMVIdQIDT4TqjKS;p-5K5d99KtM#(vn7`QEl8r%yZDu8r@muB=WiTMR zIt!~#Qe2g^Uk|Zo;fq1+aQkfCg+b0SOWev=N^uFNZeW7To=3{P z>uAyXW+KZ|N!gcIuqD{mP8p>_@Ix&RkIO;nPPLv;_YB1n*1p$g_E=wO`#n)&ChXp8 z4%LQaoa&ZjSrb~C;B~{!sAD(bfV`849DBIzSuMGfp}sMJt`2so*0u3vWdtQjwK1&> zd8n)5!0#+ZMOgt!?f+&cguXn_J=rD2-sOQp{D*sb`&lOP2Mw0z8nkT|>ah#H+ypH{ zox-p%S>Ry<|6HiV=19G=@|&cyT_s;w4-ZUuPDG2yOQ1>8Y}(S@CXYc9>_j4oPy3<7 zXbJ@$9D5xoR|{6x`Cf*VuEX3#^HI{}-gs}smRB=BU_w!H6sAe^A_5jB2KPW0Va)G>FNSI1}&-EkvYEx zynv?}T!?(PM(PhZovVMgwsD?$v8YQJNyb*o`1yOo-47%DdkcoPS74mKO`lXs&4poi zUQJ@1fW?Sf8crzq)osO$9&3mcOpiL#0u<>>p%eGf}k znfcpd)o8(8rvlo}hQqd&X9AzZ@p-|X*~5pW`^+ccqPXQKtUR-pT?_IA9Mpm0nA$E` zBfVsX2Q*+p&K*|?wV13}ti8RBoJFF{He6mup&<_Opt$UP5GH#rGK=@Tc&_Y}V!|lG z<-$c>y7IokRE&$GrX4nUIC_-XiHZqPQl$e|J^>{LnFSi@?o!T2L7djyYx`HTE&ip` zs2jr$LDNFViWZ#-IYvcqO};nRHHH!4`l{LA`02`nlf8;g991HfD{Gs=IeU`SSFxjD z>Q6!NnX!svPQxMEZ@q{@7~BZ(y`fn(fD~nSpO!SR=_5oC@@w*5t6)1(Wd^F@T6bE0 zUS0XMpU&V2^EY}C)&fW8Btco;ew3IS@kESmdKct#lm8m!1`W~W78a;YQhCJLL(fCH zc6|Tp1{bM8!UxaYFYJ`QHe*#!MA=WJLHu}j`rUEAJvTO9#Uj1qZ}(JX_)d-1mbx)3 znwF;6{<<$HF~XxRh0lvjkY?q>EPKdV(>FX>nqpk7$^ z^Ceiil~Nf;E5TZGf=0CG+daUj6HP|}#;>HNDe0xees1QKYfl+e9v^+-4DFwEAeh7o zp-ps(X;AT3nV&3sY%|F-?=RZ*GD)>UILrfj>(6!R_-sEV1LN5Z;R4;fF5y&VG1fU1es~ z*1mSK=jR3)TP7GDcF9j)5cUWb;>Yff?MtnZr8<|o<(-uYGEWWZ&rIWMOGFLVZOkjR zm03n2Zc{vu1~Yr9Tkz79r$Y>imV{dSkG)LI*0$lT`b%+W)5L|K(;Lsrqgj`&idKfu za4Hg{(IZUcP=RQHLVN*St=PL2qUb5|dKHiH?t=M1gSk?GC zN}?U>{w`|wUpg%%VOd4LOAvIw3WF8%fdh;n3X>u)XPct1C1R95)lV`a_8-rl1g$Eu zP0iz$hZvylyw(x3rOw~Z98PF8vX6~j%IGQ|G_tlgoKzKWGc}nuGKe!8i--da?HNnv znQJI@uqq+TEG^*e>F`gCV{G#{3KmnE<1v^Wy<%nCORk^%E>tn2PYvPTpMEy6q9rr> z%k6}-(vj&m7kq?i-(b#Zut3;((<6e^re{U$L29bxS7|*fl3F9@J4|E=yZeD{jW=7i zhqtts*xXq;)4OH(E&jP|W~NjoU0z>5xTKSgJqJpc$NW7YAStc!?v*6){@C_DG}?Pg zR^*&gmh?Wd+I!09xj?GKu&-}PsTSw5^2Yv!j9$m7qLMaT`1dx;71CRI>5asBuS3T$ zr%V*fwi^KcE3q_!aY0t+_bC7JwL$Ss$%l94s_xoa7T4W)R{T_kXUwidaYSum 
zc-s`&r%}Mydk^01nqDe9iF}|oI&D7`apmjcS4I%wbZsS)@1J`Qx0ps1E?6njPq9@# zx$R95^Qe~5kxm>BLfm1Xr6j_FFV8?R?^+(Rifiy($2H8oJs1*^YhwzBPN`q9OD2__ zZE(xXMf-Kj0bQfnVb^s!-!j=*;<74x;M^Y7P=he{yr{$Bv5L77iX>w7VAa)+^^_cm zJWMl4jBIFJ;1P)1URr=Ji|ifwJA;`;UvsaW0jvTi-FbjLw>DEpD`s7@@8O{pookhw zQapvPLnaEQo-um%w+W6u1zau-zBp z)t=XW_~wR--YRodi|if&xn~o_(@swTXX%5_cF4;PO&T<3v7@5tNg7};OI#MUPCAcf z?KeZ6?zoSh>r`=;x}c0ss4R*Z!!=N~L_D;CA<;`lMS496*~$rQxP!+2?;DCTpUx?h zfq|2E-Y3jD-&FLfd5L}zeSY)-g-o(U*Y2(};_-)d*FqbsGLw_plywbR zCXl}|zWPVEwFa2vc~APzy zWuF43x<)$ho2_v;^=n#a&`Xt z_G(~m3)%RzdTI%~+Bt7VzZ1n3;}9ci1fx?6@tkb45-kH?#&Db5&;--giO~L;mK7$r0i!|r^Ns@&sUn*mQ-0*9PqvIW zwX0N#NAX4tZOdB2d+%pv=5{WNtSvDjuns?1I8+~F_ifM%46hR^BdonNti>5k z_MywV+CssGC>ZGS3q&F58bv2mri#gn(%(bhW@UNt@bGlkBJ&iy^JaE~g58j!$op1i zWJHuD0ml*(f$Q3zG~@cWt1~3BIv*v&WqyU{WKm72GJmw?u{z11U46%UBitH~8jxa^ z|JJa&?zwq&h+;2jVr7!m!VLGB8o#*6X2hOtMYbPjB84w}58+ej5a&b0Y_JcY$wzlj zF60g`Xi00y(V_EQ@OzsiLPsocc0F+m(AQ=ojWPege}ntuxtKe-S`u)}hD5$=JiZDz zFU+mKGj_$X*uIR(gp=FaG(o(;!L_o<0GqW){$LH=NnZI9RPBSKR4jt?+M&d zb}pTB&3Z5AdY|+G$)q>5xZanjq0fG%kQ4TooNf2`^h*083-@N%)BYDfvIUf=Xw_-% z2GeWW{zdSQpPZk^`C7urgrL{c4oP(DWe9(@hlgvyPO|FGqg9pM6mD$lpg7^MQ8*AL z5*$hO=&S#pzlE=Nt$?8 zu(+LbhYwxgq{5V^IpPr@jZ3ZNKebddl>91Q4ExE&pCpYI2G;G72F3Lor(LIA!B)f1 zi%0C?Rj}aOU@&v2J(RkoX0*h}CqxQiMfKS!A$<1whl8Jhwo~sDwH&l~BluLSR|{T! 
zj^E}J5CpFV0h8;@;MKAHs7 zZ$vZpH5;SD(1I*Y18*L`fBmuT?0x6yV{b%TXC2<{R!CEUd81d%C?PG1lqig;l-1g< zf_N+8$mG$1co!S3^;&G5aFb)?s1GU%bwqf9)-VOt0pc7XD~ck!oxmDb$!PWL&-JO% zhZz(VQKN9j9z!>g1UPfdXg^V+)}oaMoH_aQ#|5uJMswqcRXR=L1!*q_OVwZI<(w_- z)-vMEX=<^9U9M}_j!Qc#-79*p=)RVG%!{uBh6bJu*QoeB**O->F3Q?ccaYC6Ty@A& zf-_O}`um&iZkz_WI)s%;!Htx{`tI}*2N4e?5=!6w6smPBoyl=yG}kMIWQBT*dGv8d zDwuN{>Z@5e1DO{s!!k0t%oImTIWsu+mGPPfY#!FQ?Z=9Qp`f&)=NM}(`w3{^IW0@P zqnR^26O62}7S`Xl?f&fANma`|rZ3@2<(3Y-wXMHz*z1;)<-F6fU2TO(=>E!6eU}c} zI}pX^kaw&Y7RoCPSW*fect6zUrOID?i~e2(k~}IPg7%~KnMT$ta!lW1k~_U~S*1OvVrvEh0A@=0sYJtqodt9CpJ*_#GGJ zhMNTE*vHwa1r~CGI_GkD)URkM?rd5${A~oezZQM|_pK?$9=OgO_I5PN(RMS{(ix7DO>5G~-*@!&j(3{S zvSLb9_V~+#%nWNV^fGwogazu`&C#i9s+wf!3`_*SnYv?sm?QK{TPVJ`sr4oVOBsG8 zbY zd8=uE8Fgqt{1&n$JU)yb>U}N4HFo-tb3hs~@69+_6A)Uk6LDl_Md`YdT4_o-Mir~+ z?iFiPY{B2t>?2!sd_W!&eHyXsC;Dpo!1mQ%nQbIxpBTA*z&oNxAxEvHGm0J3iwn}n zFs>)tuPey%AT{6wHk-6{i2A{nm_I)`|W z#V++0`0uB$O588ANYe5Z*dIMvQz8jxbiIuJe@T%S3b44!I?4p<3Lve|O-9FGJ}=rte0!9wX8PZG>E#wPtj; z4XbAz4@KHSyG=pT8tTj)zWfe~_6MT{Z6nYJg*A_Ud&=%BO*U1bD#TSE4lnQPcNKzd zZ4as353cr#$AQsL0OGz5+vcGy)U5nI_3*09xvEZnvRU)QZX_u5P{fI7jhH@uD&Rb{ zKcGIn_g&;wNE7o>AuNn29xEIP$q7w#WEH;2sr{9^W2Duj{{ob587J6C*|&!k?+8lt z;5cZ_9%C8v3O zLBGL{XH@6QYp7S*mG!}l7bZ{mAA1IhRmSezHh4UYhmIr%3)FS3+I^?AQtBdhU(GI; zb&^)#@$u1lM03G{9ozRv0Ybz~;2clD?me^Rx|!Y3gpLS_l!xlblj6B?_sGgTKaP{q z$~bnsQbz`K5_`4l-lH_JFr`_~4)tYmyz2+yymcLA5`fEguW|WUN z&08Y+VXP!+sp!}(F1@7^)uQ?crB~>q8X}dFgZx%-@pqG=tbEsSh39%XXf91{GVr{E z(M$3u%j6qDRJRCXgvMq9xr%qWecRyv+HU}`O-iXi6)bs9T3GqE`b=N8!&^D*3!CBm z?q#-RI6FU+^cfYW2Yz-!c-_jp=V$EIp5x64xgfYNkICk6+@U! 
z<5_raxNj(^NZxjtoZc)8U7xSJ_5;~l-jakXN6zTY(8y>IN7?ojr#jyUa8tTO`%G8X z{d$nlfkj_ba@c|~CI(27cy81Z&*`!7uQhI<JaR<)t6D&T-d~OHq=VI%m>7|#5S8`P|LmqAKK+ML*n3@3`t}f`T3EE_T@#$y!gh7g0d}U z`48@zoO)Km&!~8ruT@t^=p>A}mwb~a1`c9uH(@W1*A!KZ6gBk}MNb~Gp!>8vS57n3 zbD+Jti^m=8w-n$&1*^73?DI(s&w5j2wQ*`l; z1TR$mf2;-Hzj6@eF)6iFwGxgW@uAduB|41K9wM`l^SNW@EC)wZYxLf)s@=ii^CMZb zU?p9KSYw7QhDwI%i}{u3Xweh{m!9oZ;worPM`=6%BJliw zQhwp@z5Y#D`)^zPZHxcfK7jNTV0vD3Q@VGnzMX^Sh1jcZ7FX>k=lPTU*~_M7wH1no z11$dRsja;q6KXf78W(ZMMvv%s?Iz{;y5+e+SOL?fJJ4 z{D0O5WXz=5I#E>-rDTN0n9K5U?4msK5`6n~sJK!&%j~1M>Qn%=+`aXJ_qww+(0z~O zkb{=HL;SG7P0yrKY#Ui8|7=A7Z9#!1gTyJuaWO{EtxMfb`H?AhC$^*chEdyeF^0_o zInpIIcS2DyEeI>^~ zmcT}-REwMYW3|?vk3aqAwBOt@dxW79t#nL*m+@0lu{qMF#nMA>rb4oJ>Jt($cQPwFVEF~>y_W`gj%N4 zeOE(Zg4svq|M+oG*RM~g?Ki+%`2ojYM4%BGDP!Q7@Ln>q?rU?FzxQNhf9UW&k%noF zAV8C&+^?r9_%~p0ljZ>p3Rb>BF+k|&AnN3!iU6%YI)7&2(sBG`z_;IJYl>&Y3Kk5d zR-)MEv_Ny5Q9lOh=5GSil9i^bI4&q6gj}b-Nw&4b+6x4kjCmR;uSrtijbINN%Q48C zFd@YT;ts0a)ZDacrnG+2Z|4~nbK22ENSP7Hi!u|u`dms@h{?EcAQB^Z^s&C21|gqL zu87}|wgT@J8Q?KXNqRJxM4AwrYF6(-Nyeui_wAkgVSbBqK_5r2S|2Bz$2`D82@sR7 zROn2?F1gO?$5`hYWY=GH{IXZt`(V8@Ib6I#ahd3@FO3zuAp0bp`IrlstAn#*zr{hGenL%zbp>jW6wdzhU055RL8GB*vQER#V z5VICjiC@h`i4AzSK)uYIgA@Gydzp4904vT1NS}7mKK1&1Pm%AC2?}>+cvgU)5hqKd zF==^*35b-6)^z3wE%%z^9LZ_b;KG-L?4#vbR*=T@tE=SE~!}*X?kj(Cx}ReX)cld_JT$cH}xARun$n zJExaJZOSc4=i3rPCG9fm*ZZ~HZ^p{+{DmYHOc=&m+F#u(J_(NG^LuS0COnQ?zff;I z9(4z#?*R%*B=T<=B#9Z@aW5IM&?h z3%qg$DpD7Z0<`Hi#VkN4#P*VZryb3JsYeb zB;i{!%>fR^{Wda?H538;f0ZAbZ&!)`mBkk~w}gJmb-R#D-p@J?IUwp!rfSs6YYdc| zaNUSqZJa8yvS%=CCcGbt`D~;qO%qb@3F@R%gs`A-j|H1tXIUb@ksT#Nc zgPBH0CW}5~*h|$I+5Pm#YeYFioFVGPapJye<)qx3l5=FzioJHqt4Atub9Bh0va<14k}T{)-1Wm(R(ky27THUq#K>;^IMlaZhP>kd9=|d33ct6 zMwl@azuS&+!j~M!Dj6w3baQ&QSfa5&R;&OVDME0!r}*;6F*lo`XOBMGcH=0GPz5)m z>R@wJv{j`gZiI{rrKZKdE5PHAIH6X^=!WHob4?W(<)85;UGZP3-Xu-*a9dF1J9eBA zVNa)9_=s{ZO4gA|%~o>fi=9k}0$-;SzFNaP+CSF->$+)onmpaynvXbrZiRHPOxRYsKI}s#&Fy1(1P$VC!5nbaE zMl1)DHf5J?Gb8I@hT0pSv2R9e^>uI0a^80lC}T08)aH>SL^S 
zQ)HE?9zBHXYDqHp47#DtjVO6<@nyTd)|DD{9Vl|MX(SLj^syrLkTe7$auA-Jc@ zy`%FvS3ffhoiR-e>NTS@BP8o5LpGrNvrz{x4Y%fR|@ zWJ0LYP`QDh9H+^{;uOwSX~#9{-hXEfqQRX-fCA_>dm|TovxmvXG{UNS#m>^ zT{mYY97-;S01ovv`FXtIl{Xj#zaU~_Ru{c~7Gg1xyv1p!Hl-Wd3Vlq-Uw^31`e;h? z?Vs#IE+foXJ6L%5en_yHx+A7AVEUR43VGJDmlx)O$S1`-EHB)R%FyJQ{8?j9qSNWL z6@A(Fhs)`WS)G0|(GvJ78~E-OH-<0yFewz^qAkE;ze{cve@Or&-3BGjCqSB^sR^c` z8$h~KKJ$jP*NLSzFl);`Lhg2^RJI73g!d+ifM1X=iL;}q@?d>oI56DjPOHG6uR8?IA|p!@W`9C&PC zdJOm@z5IW$1Wu@`4ZFCRsJ985c(9^)NnN>sU?89llCTUp@Hz+CRhT%=iic3`>a3qD z0FN&rX)ZOVRd!SD5@%DgLpK*7eOWzz@&e+g=2ryD_^z|F^nnvuT+2eMe>iqFcJ_V; z%=>OsVig-H&oQ6A3%}mw<_c8B#nT({@v^WeZ0VdHx0Psp)^I@|jSCfHMs`bKNiy+A zQ~VgAUYpd<+SM3kw;?F&Qq48YH7vARI%kbwj&UwY@zuc>=SvBG4pNX&;>oIN3;Ry; zXJL*n`N$n67HVKz|6u4{>=g|M>{(W{$ZGK;;ORQ>mc{*u=68{VEVp&k0htaZ2I;zC zuj?CMByltmIzJU1$zAp5EvewDG>~$0pw2GM~dGD&vUK9hgXx z(N!xYYXlH3U!6tcf&p&EtIr+nI>%B24!I5~{1MYTjl^WC1*T4C_(L3Ds@=CJr|wmr zs21Tbd>`HR20G|=yx%U)(-EDexkU<5xd#YQpCh$Mn=-Z#!l{PyDLG&Z0;$NE6%{QS zQvxUK7N*S;#id{EcwcNXY0;b8w==ZJpW1=u4cVa}#a@mHy`_F6m6SkXv#$ ze&q|hb6%u1a*~kN`2uMtL?NByGU?Lqyz0P@Y49fI*do&Po>k!yU*fjN@CGrB-MI74zP^->q_G$HVmgzr`JC^ScPpYy z$TNg6*~z8hg~{H>2qDzplpjlzA2r)GE+>9kpmQMVeoe1J1c;9?d31+ckN_*e zI7nnUIJMzFkF%PWXpa}jgAgMKva1JIrKCLJN(lWgC+#F^UXvA&a;HwDQ&BciM`7B} z;?E^GQ^U#(=|x`Ywy|6o1GM=RRpv}$d8%$ zSDu)#jZ*}O2_V%AtjL$!RC~s(m{*lZQM^;k=wsu!%6CKI&d$xjDOYHD93QviXl~;; zz$<7Hu(1Dy!?OgM!OEgfS2|t_EVLrt@d7Q*b$;wxn_}UBNM+qCzjSi34{3V7GaOu>@SU=e_fj8Ggcm z@_fB_5a=UqlL@5l(3@-ol3y^l~V+iBDQA+q;jIL=O4KcNS|6l^?y zg&GS^zKr>#rh50IM`|HoZ3<=q5*!0Nz<dqU*Wv!)+Kh%{0h$y*qSS(FCf#?4w>@%?e@ zS~&ix0oK|&ZExf$dX6vkJd!iggRX`N?0}%F6orAbcBqMK?(_L`S`NJLSReXj6lDc96l6$7p?1lV_Smoo5kO6;MT}r-lEnOW5}?lMnng zY4y}{(CSHN`&T;3lW5jr#2FC}?>sF!AV6_r_O1D?}-Qs36RWpm#t)>^7K z`>sEXvqMqN)ms@54*2ok;0xp+F17dnkO4}si{Ofade-j!9|Sm4UXS^96Fugf=-Uw@ z^)4ZXt$@S z{K|Cdn_e}wT@k9lm$3KD$mob)IW`-SRJ;+3BIR(uUcb)li(!cez>K0azzHrB39}ZZ*V7*hu z)3-X_kuFYS4N5H5>cywsi<9+xpJkIUdavl=^|=D#5t}3bGYMm$dm=iuGASIe1!hvl 
z-#%hY*jR_8ng*r5FRwlOnx@tKsjv*S)ynoz78m7vtJFO@-zXw>hFhGWNig=G^r2OL zWKX2wvs+*h71yzQB^an*V2%-bYG72@T!c<1xQ*?apn@*+iDmbZ*-mMct1fuH?<_o# zfJk@q$_HAQ*y6);--mycH9e1y zynK=yf)KO+R-tb#pc_Y`P853RPt$fmuc<2yFuC?oi@TgUsC?AjlQ}I`U#6lssdQdl z`Z#TN3@;^~(Nu7Y9hIF5*<@|X^XC$kvT~Nr$~7TL7mG~4&z%~LLlS@UrQ=txN{hi9t7N^6qTn8obx<9MCm^u98z>oU?yJ$mo3&_Cfz!%TvX$HzKL?)J7hg34Io0@SLBQ8l;b{n&hj8vx}Ae7 zoOv&0Q?M;;pPM(2_*;a_*sER3$pZnO)6w<3PYfb7!1{~>y0!e?-xS$U6E7fkcq-38$rf5XG-A;cH2abI8QIi$@!~to>ONQ4yU%iK(XbCmZy(ao`zbB zgi3eSe@}KLc%|0sMRuNpWrry+GXrn(oipqUd~7WC>)gyi*ccRoy9TFkZ=xz`eBv!5 zm2>*UQ0?d7T_zUT6||wND&<+t8s|too;up~M>vay3;IWp$liD2TBJtaTcsm~cH6`w z7WrY75^Jl@_Kadz0)qEwsJfH9*I-t+0Qc`QRQib+6xA*WQVmp+QY@=2-(UgztyT+~ znajE^>1+YAC^Z_U&va3hsCQ6Qa80p&WAykhxi_x{IAnf^&d5sxoUdR(&@jf@+KJSn zC=2aM`q?+#<-tXRB*fPQ1QN8$xaEwP1jc_ zNu76lxPoK@;GNba4KVT?c$Dt*+bi(9#RJ(4%(~u5)Q`0F>)EpuR^xugnL^xm{X*=` zL?^+V?hf0;f~;d4SrRu2i*TIW(T=_uBI(i*$9yXHtkb@8Ye*kBg_wnQYi5POVNVZ? z>c%Q*Awe(0K7D4v!U?WIBrjXwxI}>Kz@AfjIv$g<rTb+pUjKe-i2T(BU4!-^DFdzma4L9o7*Ua*10EHU4GA5Lz3}M3sIUn;B7I{ zAt}s@I&9U35EQH(#tD<}l{3lKHh^2FOw+_l#YR%e7H~FxY(-~B0aV5ZtjV*=%kzYF z2JpfOc4bol8`VBQhx=Zb?1QiDZ$Lh_`?(6bfS2%gD|5U`vnH`7C)*e2(oYnxww*@F z`Rr^CVAzxJck!H2i3XOV0fgBpmEPZ33-bI$ANTHv6+qmIugubv!CjjRjV{sN1dAik zLfc&u$DJ?S;fE)Yz|*;uB*lV&`MQ7334XOYL{{NZI!ySZ?@npTOz)rTbBpn)n}cDH z{zS{!BQilVT9>rl?HnAbPw9{NEnhCRvhY1klLQw&1*`#L=lSw|egj_eAV&F6M2P~` zK~<8l-r?qn7UBTRZ~J8>BFX+wy^#NdbUsmvfr95v4p_f`Zi9W*ap$8Ow1_ppB6xuc zhq6_GG0nBMCV`uCnwHpTbGSvD*n!u~6~Jq|6-MJ8`SpzK^HjgW2H|w z|G4(Oa2j0u{wmD)?JGT!j*?++uTqH>9fb7sSDG6h4utv9`!?)m`U+Aj(ZJZp_&*2U zF)4AYIXmd?w+%8qtBnW`*U|NOlvSs>1>Zh%qXhuar4;mvwYv~fE`3ImzXAJ1B#*(? 
zE!&1Dp3@ZD3DS8sY%d$7N#*P%f=))!wrwE9?&Y2{5q20bSBwc1iaA+H-Ab{NrrP>R z_Mw)83IR?olpfwsld%(lT>vk$;Z!rbwMBa>lg_ltU=DiSbZz#kFC&C|k(6;<4b;Lj zZ&#vlC6x*I9%8~BF3PuQxvH~n?BCGQs0gmR4!M(fyG1=TK$~l!jo)0OEfXidpcqGp zi4Q1+^w-ou!7-J+FEJ!&nURMs$|JI|cj1>|=A_~+wYpfY+NL7(_pKLp*KLwM4k4pj zJKxD<(nbegs18Pup@uYyrtKBg)HgH+Z`x=vWIg%7-E36vwO5}dy8>iJPLJqmC5cxc z+0vW6!m!VY!wk(&BO@&Dc^M2uFF`E&0leIU>5o~oZCvv8wr>mT zpKPuK@7OL|A~Ti7%tOywPH^C47aX`Y+`A-6Fb%V{q$}lQn|JicHiv-wVp68178JNH zg+widXH?|~M;|l4T{r*1PNaK5aR2zypmw_r7BI&Sui%pku3~$USR`yqyC_ zC$_H28N*wmI_n)ycl4Net0RL9LDDi}Z|d0MLG#-23cErv0}59eiBO_1|Ksmz|J81n zf3Nqq?f>?H|1Et0Zyg6sQ2-%j;&M?uCW*E18?f+T`H!vDxeBs?#Fd23KQ_LEfT59x zk?E*En`~(S&p%Gd|9j$}7W&&be|yf~==hsD{{NL8d5M`4e%wx!ennHVR3M{_VS@j&^)!jxj%hgL)UcG>xy(0YTAi| z0$P|>HrufHRj%n-5{)1P zZNY-Nab)Eq@#*GP=aCIox*hu`G)I6)CJgK1mrED$UqHzDh1P~}#}TH)jT`l5!NrV& zqw&mgL?f8(xGdcmI3ys=HQ9S1SK@+P(x_TX-H3A_$pdBtO9R1pVPx~-_W;4XBbA0p zQOpwX^n?>;iYYTEX*rH1a%B{>yDHKiu$t=PhML*5px0mXE|6`|lW1fx?Oxd8GQb+I zT$UvDQi7Hcj^>`Um*G)z_*z|O$`v)%Z>%t?@`FXLksvxkGTPyhAVqE|&7hoK?A?I1 zd+gqf;61DL)3{ZP&B(YoX0zTNw31!#pF5@w)vXQsg{cRtSY_y?ypqMoYIP30ZNl9s-`mRH; zst0jL0{`we0b`6uaG;ZyQ|YzldlKg)fn*!XK>(S4PqcV|KcC+UE5y-YS$Xo%h~{*O z!dT^zK4wEFNhhi@*W$8VUUc^6F8{kP#=iB>m)+r^xgTW!$o{O^bu?xtUrMng*qrAp z_VaT-uSrA!?B*7IYo{zunF8JYJcR9>E{LT0?zDNe!K#!?@1z4?sWt^h3-#>DhHx;o zje-vCO_zBN7iN(KjfJ0WvA3;fEdK_yFg=DyQYm&cWff133Kl5N?>;jQ%txUeviJ*b7ogLJ8a< zlwXg4Pk#)#z!brYJwfLqwbs}c5In8W#4EpL{Z}wTSmws!;OVZiOL0%iDkGsXUsLSY z(J%E#HATBByGg7bPhj2hq~8Iag8PcncA(iMYllyE0-0=9EptMu!(7=0HUdAt_qeP` zp$x!;$feDOxiQJj5-W(m@W)|Vsxc@1_@rXwy9*efOqn<|U57XBg=!-daw_lgqZhb=$W3qLD%Dt&8LqN6R`YYJPeE+M6m zQSz!FtpZ~!t2KY+3$4HTl+r_0`c*qsRz{>z=(1e6$ZzG_3B=`H>iE9hL3tk&GugU!HpA>oaLY znb8sS**^jJ#AhYeY2n#u@~i8>t-W>OvM=zZxd-XhW`9v4g}W_oJY3c>rWSul8hC=9 zwUUn`B}OE*c9w(P{fAh=@-HlBXU)5>bW&hM;1D8Z-ELI4CwVY1&Ur8gp$!t5#GXCz z+yU}ozn}jQWZGapkXh3m(YV+GIJ9Byk~n+%8&LhtZ@O%G$cH0}m{rg?PRouvic5s9 zZzEcWiPA}uBJE3o!(4k!3uG4QH$XnZe*f0ly4*A1d9J?ocPTpdChi1h!Wwk+*fd2E 
z^_IuU_z#mSLG~zLY@H>Qdi2(E*YUOR{uSX|q2^WH!^qY0)?|YBcFT_A@WSd>h+=YI zE%f8_g&_1nTi>o;x?Rh(y^oD2>`R$8uHPlU=ifw;8mFF9^<5#LsKg8!-wQ*@e(YLj zrItyR>+&9>>sP9-eg702V$3SsefT({|JXnI8n|N?I)Jsx8P&caM+uY&WAr5f##UHp zwbj4u`q3_~N$b_@x+@K^mtVCJ{MFEaw;ZYbSb*LTM%2&pguN>Ju4Ab z*$F@c{2)Y9+EWXdcyA}&QIAO)|4iH%x26Xq>rtUIr`_Sik@ET8OZDZU2fYOV=fbed z@RZG(=3?YFe5l36#3|Q0UgpRG3zfl4ZOLGR)D~wFsp=DGetDs#GSGYQFvZu=JSiou!)~gm1|$T-pd0Y%yjod zlaA;02ua+TLTd#?aYZ5sfgEdzIn&t%vx{SEDmGL%F0T)#)0h~o&0gl9j9(}y8vhiT zf93J>F&|ZKw?YJe;bZ?=vsivd&h?0DJ-DGMNoaqyA^w1;J}BGg^?ueh;0w|0=)YF! zP^n};hA48;b%{hYqfb|#SHqLq#A{<~b-Z3;C}5F!_1SuRtDr&h{cuuEuhfH?8z@xFQsVv5TeyD% zKGsdJe;M;){)YcO>MHCUxv}Emt#C|e#XdSGJO-?Mu^o7}ncSJ?o4?~RGh!d*3>hq^ z!%Cz)KS#(IUrrQGRcPV|5NP1!!hS_g3g|@gCj%jam8T1h8XAB#Wcn=cXXa4|>30R1 z+biV|8VEE>#2%^%$PM=`TQQuW9f4)AM3|r^-keOq5cwO`nYsZfG!2i7Omy1TT z4*#6CQvOw}L8r>qp3W5%0<&vcSb%k>?nP@|0z7yDLn7*s$pb|Mjzc5S%?bQHIwyK6 z#Z|zyb#3{!h8Ao@KnR)pi93cq20uZ`y!gr=^ASk&9^6% zeH9kBgsD3)_fjx^zs3Q>_h~N~NUq@QlRUq$gUY<#r(hraJ-y(N*Nr40d0G9_BYn?I zS$PD~xrVjY9eL*Yz(iAytGV4~bC{kvG^LOeG9%)Z0UB0ae-cFq3*7dB+I~jF*j^QjWzhWPjI^XoCa9Cyas(J~| z8owNtZwOuJs24g%XG)u`R6E_=T?0Rr&nVmou!_sN3!V}8P5fUd?9dj8vQ zm-5BCB-~O_n@f1VOiew?`MswKFA`=rZvuNq(w|q(yg_r6I?__LAr@8+zsQCRnw*J( z{AGqRII6|LVsQ$NZbKUF^mvQX5c*ihK_a;YS=&*x)|d+Tp1b+xq+A)OzQP|3{g^Sc z+7-K{8e@lB?A1c(NIG#UwnwvtT#gnp6ln!JdAS z7;;XCG&4U4B_31q4~j+h_eR8NqB=@#I(B*pE)9uZYjS4EYnn9B4ECEu^)0GqUx`nq` zm&wxsCXiRJU%)GTu(ocdT4aIH3DJUBh9oN`x?sYEqv!^r?17o3Mh9> z6Q04`n(`ya7u60%dytMy54l0IDI)>ZvcwuVU`RALlU}_&P;29*fg@ zXBc6AL{^Z?(SmV$Wj14(pRCjZ0;9;9c6?%Qs0_qJK87#5ZX1w+pmu`07^0j0eD;+l zbr7Ghw~10?gf(LR0FR1!KYZOGFQSUeWQen3r_3f*?nsW1=u|q&WNv->!FZR8_@(82 zO}*VC9zt1O{3685#m%YQYI&aDyc;eQ_^x+V<Ml8FeSfF94YbNbGRgEoLhze)@0C5H&%3?9x@m<5l_4=(9fVc3vT_>YdTE z_8e@Hz3qMW`wJ!cz>*>w@k5SX(oNvqHuUZ|w*C40>>;J{H`%#pA z9h={fePP(by+Kp4kX}rN+v#Q$ExNd*qfkrUOSpHeCo=R0Q+)Kg_q6L5X2_X|LOT1K zK2xJ|_CX4j_)!bS^fkH|%tabk#BxZG$`qfCo!^JYf$?dD)FKwLZy(C>PzXA|BM1!E zclz}cU=dxiX;p@i1xK{8VIiT5OOWu|y;QNErDwqSfXCppnyrHjI_;y;$LotzQAUSZ 
zze3v6_tdQ(i#-V69E189b`Ce2Okw;NNwa1PU+YiNkcNG3ftk+K)HT8$VFeHun3NHS z&b!ezdveC`-x0x3|J4ZF+!{71{2}S21w44FM}W^h$|bnhEp0W!wW`e&qBlklf!@7q zU!NyFPH$CRKyc8+Bn%*vuhS;m-E5r8o26ZdyUi|$O7&VmooAk9WNtRV+KiulHl#+o zi}SB>4Q-z}FWBe=Lrb-FmKo~#4^t9;R%)_dNgidZ|1o45VdyR`UtWA`;SF)0j@jVN z!#iSK%fbbyI(U_WhAv~Bgj-gnHs|#!K6xPbuujm9qQ8}Pp0G)d_XG~x$ovL`MGatl zg$p`m_K@}*XAf71*EqS^iyLj)A5(Cdi}zSU_5SG0)w zlREC`NDpV11E5=`ffmQXd2f~7e&*U411z0Xb>i96cKSc}a!FGHy_(ilOjd(Vs;-`y zWYOjX;5@4~U=e%bYXb}yDRmfU_1rKsJ=LiHLA@1~4@K}aCZ-hh-%#{bs^M+eEt|ZT zgzGk-w~CDKYGT_IF6wbCMKgZ7c+%K3WrGW7CsYI%YjwtCeR0O5nH)t+zs{TN9k8B|$F{j#G}XmdoBiXK{%g(rAHA6`io9nML)# zC4*>V)_-bl^`i9d$u!b-TQPtawEg!r^xH!MgMavCVqm2y!e^kvZL|H9I7;1O;Ky1O zLQlXxd}Opz=jcss=CFPqMNBCr`RM-`_2u7FUADyW#zkdaR?@91KC54{np@1FUw(GB zy%{-q9zVj@`0#c0=$5MOG?Cv@sZBu)vd_U;0k4xe=dhi>NP}0_2eQ#Lc z89m=2z+>oHd;sb1YsKBaURyqkSw*MIvT<7XX~Wd9J~qsJ??a~ea{)s%pC#aJ<aB zPz|T5M%Bwhj|4L{^-n&u@-~dtW1Jg(hZlB;T_-a+(E)NF6JMvV+99l^++j3f`)sRe zC)yzU3jIhW4bxKWUcm}GL>vOEuWS4mV-a7=x*u*UwRWc_Rc$Z20efH>2`Owm@e2kZ+GR#D4?dvWp-mW;qk;yAS1$Mn^!Yq7q+i66As2^+3VcP*-3Mso$rUqF*SUT z%;0e*bY0EN+4VX>T@Lzn!fHdTNvy~YUR|h&m+S~uf9WQ=PF;_*ouJ9^tHs>>xMHVV zX~?UfkD+BDj$dnkor|eb6P+){rfSz))4#G=nrA@lveX4O{B$89RKzYyas-(Y(G&Bg zCBm!hd6%E=LLqTp9(X#2Yi_PxL2$%wo4+tFI5RIO3v|mv@|(RZ)@6w1t62r8$8s zn4X5m-erK&p9kS~1R~mi3WJJ+w(45Pn;jTyV;p1kwe^;J9Q8?m)gtxB&V*n5(<|&h zn|J=lV=SyG#JkhF9=+Nsxjnh0rItB3O5+1669>~n_hJ^F#y4u~^5h{@5l?v1#-6`tU@e4S`< zL(mEY96w8mXSt3QcSi1P@NH!wuxn4}X9N#`9K6JjBQ*+7ziGMyya)tW8rre-P)Mn9j9tjF+Yk{!V*JI{L?i?FKtUz`P*aaf;{#UH_As*+^x z$vU9P3aE|_{2ks#B97ZR% zY9;YSDG0`rO;9lgtC>!y}i`Zn0+n`}2pN|vo^kT(JwSgmg&Kg!N-G~Ri( z(y_UsA>%>$d~XKKmPXY{jhX%-^l4@0Q$boZXJgKbFcn*pMYs$_bz zMv6&S@hP262YhS(7*p305LS83A^C>Bq?nxQ&>!N3NQI6y-u5NjAlde`t6r8j;$=Px zm1N(p^-?$uj~_yRo{6r5Uz7Yx1XauA7roqt#06Z`FsfGs z-Rqt(g*BR69vzI1LmD4b6s6>HIfl%6#7Wu~oag`RYVcZp9R`>cfOa!5+%nc_U z__ymj!&@MQss<#IV^TY=AIUY4)-a5BqsKcIGFp=#?(*Js>;8ue^SF zTiGa9;o**HVue!9bC_ysL6<|}tpRmU3kn9mE-uyu!S!AiTy9E6#g2Uy1tJVIqr@_& 
zn+itIa&6f%Q%K+RQhkq(lv}H^4q#m-(4`Hwqh{?f5WPN3M85kKIU|KY#P23~M?F*~ zji7w_tZBtIKyhD^FE(Ppr94@2lryz9Rlf~fxvyfhB>PFCw~nVwhCU5J@V6AjhcCCZ zeG!qyo@#vh$zRYpUx+@o`%B^*cIt=EFG{phxLkPSxEU|jiBViKG(RFd3&M&Fwy#w6 zcD~8v*`zQg`w0ve&t*45}2{s=R@LEQ=*?d=WaeQ!R)u z2Cw(-E=IO8?q*&|U`{$3MwNXtYtZ6JRP4b%71jL8#K``~&HJ##Xa0hF(+E}oDFm22 zlUBCtognY9@j{M$>MC&rXe>A~UeUu}rm1qq{t1=zAs+X!+9Yb>WMsVFG6$=aX*xL5 ze8&Q*$wSqN%rB9o5y6p(S<|Ukl|+?w!f1db5jfJn`F3k+{0G6lr#tQ-%2#L=w{)Fm zbv__6V8%IMN5E2Igv3_f6a#WVtPyt{$2_2eBzPW)CJ;1*;4qLCY%7|>o_C=4Fi%mO z-XiiCH)V@x=0D)hU0@Hb`A((UDfP=xthq!SB$`JO<2x2{e{dh67YBMboUFvtqX^kHlT zq6+)?q4Du7OL-5t5kU4cEKmLkr`b}@ujp?+lL<(@JAU(N-UIYm)I5uWU4;z-u>lLm zV*ckK);om*-4D?0{XMG^-9g4^f$xI7Vo#~_5OIy;l{3Yc-X!H~3;{yMpQe6Zd65_* z$u;V0jahPGdO|ppJ-cil&PhC(o*+eou$Xf8nNgJnn94Y~`G}M&%6{Ni?W6v&m4&FM z+~J7wxopjZt*vpm=hTS{U4{|0Jmqctp00)pn&Q-D4eDeVPBc_y><Ezjzb-?WerH zOKd5JInf8H)I$7+$=)BxGenID^3bXT9yZ4v%|2&#d@o6>=X@RM@N!EAFtb?z_W6>S ze^lo7v*SUMxhEICW&U>>-Xryg^{4elQXi?gh(;PS_@LE zR64tImmLzx8~ndKJ~?{sN|28=M-|pIM4{=gnW^snp_t*=@#?0EQaBjzY%EKDUAqYx zc$pFH?RUGQDbpD9lLVOldEhR=B6*XBH>!_p=Rlu|5Xs;tQ-~5cTnao+2qfEVs$rosIf@&`#2Za>xfL0o8<(gUKSDclL251ToO?y<(c8 zpKxvbbtf7RMfhzU&zroRpbLx-Z>zF$DBCe#BGKBE)uv;gaNNnZ9zA{WJhfAPiwJHC z7r@`iejkS*lupjLH7W1!Me(x=W>jT5THs)yGg(J_^>|l@^WhR2xi}Q|)cf>z+_F1N zI<;dl464$b&AS|^2+*kSS|UI1biWN6oVIc>c3{Vk7{Se`386a-%*t zV0kQC64cP)abMIwqq~5aOZlu$As8Ce+%W1&Qge3btB9y=wFUwsG^?T7i@B#3P;OK< zp6^nkroatCq(BGk@x?_OScU?!ERtLf4vjI6JiSvK!Y}T9Lj!F5pz?Jhft88~%M-u| zU8z_Nupo*dDjy+06HKOMihuY@@)-bts7+CoAk%2pGzm<@va-V1yBbU_^&vN*-u^!1 z(eN3SNxl*{sy9=1bPyS>PiM$A=DVHI!oUMUq(q`_@IxkV+-&Sp3sVMz$UUoNMtrx|1qs%5ZSO_`#-~Fd4_whjUD3Jsj`nu{%rU z?3MZD){tb)=`b?~^R96T#DuleM2Hee_xwK%$k$=T%BIQci?tNxjUQ^+{M>v27hQ

    E8kbR_FQo}A8mf-kj;DjW{zX{>3C?{e2Hr?y*zx_T@KGUfE> z6fDuCT<7RnJTervj`zhrHek6v*c2k@q$@RzhQtfYyq*xBk5Pt>8()XHrXl}5;LeX3 zP}RKJ1P7b#;;*vpc){;W$u^}D z&p$7AyO_7;>&_>08MZW7-I6`<#5&KW?6}uUU_T=|eA|_+6-%~Gc53mQyD?6>^(R|J z;veUi!C6+Wm*Rk})ab)2JVlr|7tUyW#{ZtFg54C@o|cq~DQA1SWV+k7%NO)SR%?44 z|Hov&s^9izosLOq(=&U$K;@0G2R=khytH-20WEd$`BbYr; z$lLPM8s$$aU)}xe4IE7O2RNta=i7XCl{-GCZ8A^6@$0h24iXz;e)Jtu-8S93>pw%n zp*!wH8{f%plHO|_vgyiKU8Q@@B8K-5TAiE7erk@K$o!Z((fz+CV;%Qdm0hk| zo8H`Nl#IObd#VJ}^uyo6#q8vpu067ytdX@)NheV-KSkYhkoB?VAx-Gb#FjS$ZIQ&(Kul4qR9!>9uX`1UC*d@M5&{6_bzN6WjTrW|!OK+MWB$HXMEGeZj)wG{57W z>&pYZL^Y2tTl58Fity5~!pxJ|PyaKp={9uU zIKy1Q+~;?QoyNvJmulP-AI;Uemfgk@^-Ao6V%apIiiE|JLat{5cUf&)y7=g#ABzL7 zy!)g1cki-Su4%lYi~Mc(%s3d^d+ze}zNz~kmR#1;d$M!WhvJNd!rV6B0;5m-e8mty zTRsNV(yrX!WfB`uG40iP-)%G3)HqF4y>oG)Hly3&&4QPU3eQ)){1f`{+BF-UTe;mj z#s`ADWBy+FI3Ywn&oGu3c)f@FW8cUGvC9?KQC16Lmj2$ulKx@ct1gBICf6>kf4{Kp z{== zN5bs*-~SnwN`pz@{RwH;|NUop=>{f2cPnfK-emVH5KMwDQOE`90u#XN6-Kp=2GeMg z7|l7OrNn57NT}jyIPmGG+>GVGVK}l*)fm+`8g8R$U^ESkrh(BkFq#I4P6PG-Zvp@T CkE@OV From 9fb9eb077650ad7f9c5fdcee875e8016a25e464a Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 18 May 2020 11:27:12 +0800 Subject: [PATCH 02/98] fix typo --- docs/en_US/TrainingService/PaiMode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index 9d4e6aedc5..da9d6a3b2c 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -74,7 +74,7 @@ paiConfig: host: 10.1.1.1 ``` -Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. The host field in configuration file is PAI's job submission page uri, like `10.10.5.1`, the default http protocal in NNI is `http`, If your PAI's cluster is enabled https, please use the uri in `https://10.10.5.1` format. 
+Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. The host field in configuration file is PAI's job submission page uri, like `10.10.5.1`, the default http protocal in NNI is `http`, if your PAI's cluster is enabled https, please use the uri in `https://10.10.5.1` format. Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have these additional keys: * cpuNum From 352b516f180f5f5b4d5752449565f63014c85e18 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 19 May 2020 09:29:34 +0800 Subject: [PATCH 03/98] fix comments --- docs/en_US/TrainingService/PaiMode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en_US/TrainingService/PaiMode.md b/docs/en_US/TrainingService/PaiMode.md index da9d6a3b2c..53046cdfcd 100644 --- a/docs/en_US/TrainingService/PaiMode.md +++ b/docs/en_US/TrainingService/PaiMode.md @@ -74,7 +74,7 @@ paiConfig: host: 10.1.1.1 ``` -Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. The host field in configuration file is PAI's job submission page uri, like `10.10.5.1`, the default http protocal in NNI is `http`, if your PAI's cluster is enabled https, please use the uri in `https://10.10.5.1` format. +Note: You should set `trainingServicePlatform: pai` in NNI config YAML file if you want to start experiment in pai mode. The host field in configuration file is PAI's job submission page uri, like `10.10.5.1`, the default http protocol in NNI is `http`, if your PAI's cluster enabled https, please use the uri in `https://10.10.5.1` format. 
Compared with [LocalMode](LocalMode.md) and [RemoteMachineMode](RemoteMachineMode.md), trial configuration in pai mode have these additional keys: * cpuNum From a738331a551db9fda25693d335640adb9f9a8d95 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Tue, 2 Jun 2020 15:19:09 +0800 Subject: [PATCH 04/98] init changes --- .../common/experimentStartupInfo.ts | 24 +- src/nni_manager/common/utils.ts | 2 +- src/nni_manager/core/ipcInterface.ts | 2 +- src/nni_manager/main.ts | 46 +- .../rest_server/restValidationSchemas.ts | 1 + .../training_service/pai/paiConfig.ts | 5 +- .../pai/reusable/environment.ts | 98 ++++ .../pai/reusable/environmentManager.ts | 446 ++++++++++++++++++ .../pai/reusable/jobRestServer.ts | 88 ++++ .../pai/reusable/mountedStorageService.ts | 147 ++++++ .../pai/reusable/openPaiEnvironmentService.ts | 410 ++++++++++++++++ .../pai/reusable/reusableTrainingService.ts | 173 +++++++ .../training_service/pai/reusable/storage.ts | 184 ++++++++ .../remoteMachineTrainingService.ts | 10 +- tools/nni_cmd/config_schema.py | 6 +- tools/nni_cmd/rest_utils.py | 4 +- tools/nni_trial_tool/log_utils.py | 35 +- tools/nni_trial_tool/protocol.py | 92 ++++ tools/nni_trial_tool/trial_keeper.py | 6 +- tools/nni_trial_tool/trial_runner.py | 286 +++++++++++ tools/nni_trial_tool/url_utils.py | 15 +- 21 files changed, 2027 insertions(+), 53 deletions(-) create mode 100644 src/nni_manager/training_service/pai/reusable/environment.ts create mode 100644 src/nni_manager/training_service/pai/reusable/environmentManager.ts create mode 100644 src/nni_manager/training_service/pai/reusable/jobRestServer.ts create mode 100644 src/nni_manager/training_service/pai/reusable/mountedStorageService.ts create mode 100644 src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts create mode 100644 src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts create mode 100644 
src/nni_manager/training_service/pai/reusable/storage.ts create mode 100644 tools/nni_trial_tool/protocol.py create mode 100644 tools/nni_trial_tool/trial_runner.py diff --git a/src/nni_manager/common/experimentStartupInfo.ts b/src/nni_manager/common/experimentStartupInfo.ts index 887460d77a..5a398bdaef 100644 --- a/src/nni_manager/common/experimentStartupInfo.ts +++ b/src/nni_manager/common/experimentStartupInfo.ts @@ -17,14 +17,16 @@ class ExperimentStartupInfo { private logDir: string = ''; private logLevel: string = ''; private readonly: boolean = false; + private platform: string = ''; - public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string, readonly?: boolean): void { + public setStartupInfo(newExperiment: boolean, experimentId: string, basePort: number, platform: string, logDir?: string, logLevel?: string, readonly?: boolean): void { assert(!this.initialized); assert(experimentId.trim().length > 0); this.newExperiment = newExperiment; this.experimentId = experimentId; this.basePort = basePort; this.initialized = true; + this.platform = platform; if (logDir !== undefined && logDir.length > 0) { this.logDir = path.join(path.normalize(logDir), this.getExperimentId()); @@ -59,6 +61,12 @@ class ExperimentStartupInfo { return this.newExperiment; } + public getPlatform(): string { + assert(this.initialized); + + return this.platform; + } + public getLogDir(): string { assert(this.initialized); @@ -90,19 +98,25 @@ function isNewExperiment(): boolean { return component.get(ExperimentStartupInfo).isNewExperiment(); } +function getPlatform(): string { + return component.get(ExperimentStartupInfo).getPlatform(); +} + function getExperimentStartupInfo(): ExperimentStartupInfo { return component.get(ExperimentStartupInfo); } function setExperimentStartupInfo( - newExperiment: boolean, experimentId: string, basePort: number, logDir?: string, logLevel?: string, readonly?: boolean): void { + newExperiment: 
boolean, experimentId: string, basePort: number, platform: string, logDir?: string, logLevel?: string, readonly?: boolean): void { component.get(ExperimentStartupInfo) - .setStartupInfo(newExperiment, experimentId, basePort, logDir, logLevel, readonly); + .setStartupInfo(newExperiment, experimentId, basePort, platform, logDir, logLevel, readonly); } function isReadonly(): boolean { return component.get(ExperimentStartupInfo).isReadonly(); } -export { ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getExperimentStartupInfo, - setExperimentStartupInfo, isReadonly }; +export { + ExperimentStartupInfo, getBasePort, getExperimentId, isNewExperiment, getPlatform, getExperimentStartupInfo, + setExperimentStartupInfo, isReadonly +}; diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index 413d2ee220..2ca23d3077 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -184,7 +184,7 @@ function prepareUnitTest(): void { Container.snapshot(TrainingService); Container.snapshot(Manager); - setExperimentStartupInfo(true, 'unittest', 8080); + setExperimentStartupInfo(true, 'unittest', 8080, 'unittest'); mkDirPSync(getLogDir()); const sqliteFile: string = path.join(getDefaultDatabaseDir(), 'nni.sqlite'); diff --git a/src/nni_manager/core/ipcInterface.ts b/src/nni_manager/core/ipcInterface.ts index e7c45beec6..8ef78069a4 100644 --- a/src/nni_manager/core/ipcInterface.ts +++ b/src/nni_manager/core/ipcInterface.ts @@ -135,4 +135,4 @@ function createDispatcherInterface(process: ChildProcess): IpcInterface { return new IpcInterface(process, new Set([...CommandType.TUNER_COMMANDS, ...CommandType.ASSESSOR_COMMANDS])); } -export { IpcInterface, createDispatcherInterface }; +export { IpcInterface, createDispatcherInterface, encodeCommand, decodeCommand }; diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 66dd3fce0b..3b0d1bb07c 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts 
@@ -21,7 +21,7 @@ import { NNIRestServer } from './rest_server/nniRestServer'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService'; -import { PAIK8STrainingService } from './training_service/pai/paiK8S/paiK8STrainingService'; +import { ReusableTrainingService } from './training_service/pai/reusable/reusableTrainingService'; import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService'; import { RemoteMachineTrainingService @@ -29,11 +29,11 @@ import { import { DLTSTrainingService } from './training_service/dlts/dltsTrainingService'; function initStartupInfo( - startExpMode: string, resumeExperimentId: string, basePort: number, + startExpMode: string, resumeExperimentId: string, basePort: number, platform: string, logDirectory: string, experimentLogLevel: string, readonly: boolean): void { const createNew: boolean = (startExpMode === ExperimentStartUpMode.NEW); const expId: string = createNew ? 
uniqueString(8) : resumeExperimentId; - setExperimentStartupInfo(createNew, expId, basePort, logDirectory, experimentLogLevel, readonly); + setExperimentStartupInfo(createNew, expId, basePort, platform, logDirectory, experimentLogLevel, readonly); } async function initContainer(foreground: boolean, platformMode: string, logFileName?: string): Promise { @@ -47,10 +47,10 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN .scope(Scope.Singleton); } else if (platformMode === 'pai') { Container.bind(TrainingService) - .to(PAIK8STrainingService) + .to(ReusableTrainingService) .scope(Scope.Singleton); } else if (platformMode === 'paiYarn') { - Container.bind(TrainingService) + Container.bind(TrainingService) .to(PAIYarnTrainingService) .scope(Scope.Singleton); } else if (platformMode === 'kubeflow') { @@ -153,31 +153,31 @@ if (!('true' || 'false').includes(readonlyArg.toLowerCase())) { } const readonly = readonlyArg.toLowerCase() == 'true' ? true : false; -initStartupInfo(startMode, experimentId, port, logDir, logLevel, readonly); +initStartupInfo(startMode, experimentId, port, mode, logDir, logLevel, readonly); mkDirP(getLogDir()) .then(async () => { - try { - await initContainer(foreground, mode); - const restServer: NNIRestServer = component.get(NNIRestServer); - await restServer.start(); - const log: Logger = getLogger(); - log.info(`Rest server listening on: ${restServer.endPoint}`); - } catch (err) { - const log: Logger = getLogger(); - log.error(`${err.stack}`); - throw err; - } -}) -.catch((err: Error) => { - console.error(`Failed to create log dir: ${err.stack}`); -}); + try { + await initContainer(foreground, mode); + const restServer: NNIRestServer = component.get(NNIRestServer); + await restServer.start(); + const log: Logger = getLogger(); + log.info(`Rest server listening on: ${restServer.endPoint}`); + } catch (err) { + const log: Logger = getLogger(); + log.error(`${err.stack}`); + throw err; + } + }) + .catch((err: Error) 
=> { + console.error(`Failed to create log dir: ${err.stack}`); + }); function getStopSignal(): any { if (process.platform === "win32") { return 'SIGBREAK'; } - else{ + else { return 'SIGTERM'; } } @@ -205,7 +205,7 @@ process.on(getStopSignal(), async () => { hasError = true; log.error(`${err.stack}`); } finally { - await log.close(); + // await log.close(); process.exit(hasError ? 1 : 0); } }); diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 06ed3f3f15..df4714e88f 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -104,6 +104,7 @@ export namespace ValidationSchemas { passWord: joi.string().min(1), token: joi.string().min(1), host: joi.string().min(1).required(), + reuse: joi.boolean(), }), kubeflow_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase operator: joi.string().min(1).required(), diff --git a/src/nni_manager/training_service/pai/paiConfig.ts b/src/nni_manager/training_service/pai/paiConfig.ts index 61742f378a..eceda619c7 100644 --- a/src/nni_manager/training_service/pai/paiConfig.ts +++ b/src/nni_manager/training_service/pai/paiConfig.ts @@ -10,6 +10,7 @@ export class PAIClusterConfig { public readonly passWord?: string; public host: string; public readonly token?: string; + public readonly reuse?: boolean; /** * Constructor @@ -17,12 +18,14 @@ export class PAIClusterConfig { * @param passWord password of PAI Cluster * @param host Host IP of PAI Cluster * @param token PAI token of PAI Cluster + * @param reuse If job is reusable for multiple trials */ - constructor(userName: string, host: string, passWord?: string, token?: string) { + constructor(userName: string, host: string, passWord?: string, token?: string, reuse?: boolean) { this.userName = userName; this.passWord = passWord; this.host = host; this.token = token; + this.reuse = reuse; } } diff --git 
a/src/nni_manager/training_service/pai/reusable/environment.ts b/src/nni_manager/training_service/pai/reusable/environment.ts new file mode 100644 index 0000000000..7b2c485cef --- /dev/null +++ b/src/nni_manager/training_service/pai/reusable/environment.ts @@ -0,0 +1,98 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../../common/trainingService"; +import { StorageService } from "./storage"; +import * as component from '../../../common/component'; + +export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; + +export abstract class EnvironmentService { + public abstract config(key: string, value: string): Promise; + public abstract updateEnvironmentsStatus(environment: EnvironmentInformation[]): Promise; + public abstract startEnvironment(environment: EnvironmentInformation): Promise; + public abstract stopEnvironment(environment: EnvironmentInformation): Promise; +} + +export class TrialDetail implements TrialJobDetail { + public id: string; + public status: TrialJobStatus; + public submitTime: number; + public startTime?: number; + public endTime?: number; + public tags?: string[]; + public url?: string; + public workingDirectory: string; + public form: TrialJobApplicationForm; + public isEarlyStopped?: boolean; + public environment?: EnvironmentInformation; + + private readonly TRIAL_METADATA_DIR = ".nni"; + + constructor(id: string, status: TrialJobStatus, submitTime: number, + workingDirectory: string, form: TrialJobApplicationForm) { + this.id = id; + this.status = status; + this.submitTime = submitTime; + this.workingDirectory = workingDirectory; + this.form = form; + this.tags = []; + } + + public getExitCodeFileName(): string { + const storageService = component.get(StorageService); + return storageService.joinRemotePath(this.workingDirectory, this.TRIAL_METADATA_DIR, "code"); + } +} + +export class RunnerSettings { + public experimentId: string = ""; + public platform: string = ""; + public nniManagerIP: string = ""; + public nniManagerPort: number = 8081; + public nniManagerVersion: string = ""; + public logCollection: string = "none"; + public command: string = ""; +} + +export class EnvironmentInformation { + // 
NNI environment ID + public id: string; + // training platform unique job ID. + public jobId: string; + // training platform job friendly name, in case it's different with job ID. + public jobName: string; + public isIdle: boolean = false; + public isEnd: boolean = false; + public trackingUrl: string = ""; + public status: EnvironmentStatus = "UNKNOWN"; + public workingFolder: string = ""; + public envWorkingFolder: string = ""; + public command: string = ""; + public serverCount: number = 1; + public currentTrialId: string = ""; + + constructor(id: string, jobName: string, jobId?: string) { + this.id = id; + this.jobName = jobName; + this.jobId = jobId ? jobId : jobName; + } +} diff --git a/src/nni_manager/training_service/pai/reusable/environmentManager.ts b/src/nni_manager/training_service/pai/reusable/environmentManager.ts new file mode 100644 index 0000000000..5c284ea4c6 --- /dev/null +++ b/src/nni_manager/training_service/pai/reusable/environmentManager.ts @@ -0,0 +1,446 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +import { EventEmitter } from 'events'; +import * as path from 'path'; +import * as component from '../../../common/component'; +import { getExperimentId, getPlatform } from '../../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../../common/log'; +import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric } from '../../../common/trainingService'; +import { delay, generateParamFileName, getVersion, uniqueString } from '../../../common/utils'; +import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../../core/commands'; +import { encodeCommand } from '../../../core/ipcInterface'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; +import { TrialConfig } from '../../common/trialConfig'; +import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; +import { validateCodeDir } from '../../common/util'; +import { EnvironmentInformation, EnvironmentService, RunnerSettings, TrialDetail } from './environment'; +import { JobRestServer } from './jobRestServer'; +import { StorageService } from './storage'; + +/** + * It uses to manage jobs on training platforms + * and expose trial as trial job to upper level. 
+**/
+@component.Singleton
+class EnvironmentManager implements TrainingService {
+
+    private readonly log: Logger;
+    // Set by cleanUp(); both background loops exit once it becomes true.
+    private stopping: boolean = false;
+
+    private jobRestServer: JobRestServer;
+    private readonly metricsEmitter: EventEmitter;
+    private versionCheck: boolean = true;
+    private readonly experimentId: string;
+
+    private trialConfig: TrialConfig | undefined;
+    // Written to "envs/settings.json" by run().
+    private runnerSettings: RunnerSettings;
+
+    // Trials keyed by trial id; environments keyed by environment id.
+    private readonly trials: Map<string, TrialDetail>;
+    private readonly environments: Map<string, EnvironmentInformation>;
+
+    constructor() {
+        this.log = getLogger();
+        this.trials = new Map<string, TrialDetail>();
+        this.environments = new Map<string, EnvironmentInformation>();
+        this.metricsEmitter = new EventEmitter();
+        this.jobRestServer = new JobRestServer(this.metricsEmitter);
+        this.experimentId = getExperimentId();
+        this.runnerSettings = new RunnerSettings();
+        this.runnerSettings.experimentId = this.experimentId;
+        this.runnerSettings.platform = getPlatform();
+    }
+
+    /**
+     * Return the details of every trial known to this manager.
+     */
+    public async listTrialJobs(): Promise<TrialDetail[]> {
+        const trials: TrialDetail[] = [];
+
+        for (const key of this.trials.keys()) {
+            trials.push(await this.getTrialJob(key));
+        }
+
+        return trials;
+    }
+
+    /**
+     * Look up one trial by id.
+     * @throws Error when no trial with this id has been submitted.
+     */
+    public async getTrialJob(trialJobId: string): Promise<TrialDetail> {
+        const trial: TrialDetail | undefined = this.trials.get(trialJobId);
+        if (trial === undefined) {
+            throw new Error(`trial job ${trialJobId} not found`);
+        }
+
+        return trial;
+    }
+
+    /**
+     * Register a new trial in WAITING state. It is bound to an environment
+     * later by trialManagementLoop().
+     * @throws Error when the trial config has not been set yet.
+     */
+    public async submitTrialJob(form: TrialJobApplicationForm): Promise<TrialDetail> {
+        if (this.trialConfig === undefined) {
+            throw new Error(`trialConfig not initialized!`);
+        }
+
+        const storageService = component.get<StorageService>(StorageService);
+        const trialId: string = uniqueString(5);
+
+        const trialWorkingFolder: string = storageService.joinRemotePath('trials', trialId);
+        const trialJobDetail: TrialDetail = new TrialDetail(trialId, "WAITING", Date.now(), trialWorkingFolder, form);
+
+        this.trials.set(trialId, trialJobDetail);
+
+        return trialJobDetail;
+    }
+
+    /**
+     * Multi-phase support: write the new hyper parameters into the trial's working directory.
+     */
+    public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise<TrialDetail> {
+        const trialDetail = await this.getTrialJob(trialJobId);
+
+        const storageService = component.get<StorageService>(StorageService);
+        const fileName = storageService.joinRemotePath(trialDetail.workingDirectory, generateParamFileName(form.hyperParameters));
+        // Write file content ( parameter.cfg ) to working folders
+        await storageService.save(form.hyperParameters.value, fileName);
+
+        return trialDetail;
+    }
+
+    /**
+     * Cancel a trial that is RUNNING, WAITING or UNKNOWN; other states are left untouched.
+     * The status is updated even when no environment is assigned yet, so a waiting
+     * trial is not picked up by trialManagementLoop() after it was cancelled.
+     */
+    public async cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean | undefined): Promise<void> {
+        const trial = await this.getTrialJob(trialJobId);
+        switch (trial.status) {
+            case "RUNNING":
+            case "WAITING":
+            case "UNKNOWN":
+                {
+                    const environment = trial.environment;
+                    trial.isEarlyStopped = isEarlyStopped;
+                    trial.status = trial.isEarlyStopped === true ?
+                        'EARLY_STOPPED' : 'USER_CANCELED';
+
+                    // Only a trial bound to an environment has a runner to notify.
+                    if (environment) {
+                        await this.sendCommand(KILL_TRIAL_JOB, trialJobId, environment);
+                        this.releaseEnvironment(trial);
+                    }
+                }
+                break;
+        }
+    }
+
+    /**
+     * Start the rest server, upload code and settings to the "envs" remote folder,
+     * then run both management loops until cleanUp() is called.
+     * @throws Error when the trial config has not been set.
+     */
+    public async run(): Promise<void> {
+
+        await this.jobRestServer.start();
+        this.jobRestServer.setEnableVersionCheck = this.versionCheck;
+        this.log.info(`Environment Manager rest server listening on: ${this.jobRestServer.endPoint}`);
+        this.runnerSettings.nniManagerPort = this.jobRestServer.clusterRestServerPort;
+
+        if (this.trialConfig === undefined) {
+            throw new Error(`trial config shouldn't be undefined in run()`);
+        }
+
+        this.log.info(`Environment Manager copying code and settings.`);
+        const storageService = component.get<StorageService>(StorageService);
+        // Copy the compressed file to remoteDirectory and delete it
+        const codeDir = path.resolve(this.trialConfig.codeDir);
+        const codeFileName = await storageService.copyDirectory(codeDir, "envs", true);
+        // Await the rename so a failure surfaces here instead of as an unhandled rejection.
+        await storageService.renameRemote(codeFileName, "nni-code.tar.gz");
+
+        const installFileName = storageService.joinRemotePath("envs", 'install_nni.sh');
+        await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName);
+
+        const runnerSettings = storageService.joinRemotePath("envs", "settings.json");
+        await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings);
+
+        this.log.info(`Environment Manager run loop started.`);
+        await Promise.all([
+            this.environmentMaintenanceLoop(),
+            this.trialManagementLoop(),
+        ]);
+    }
+
+    public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
+        this.metricsEmitter.on('metric', listener);
+    }
+
+    public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void {
+        this.metricsEmitter.off('metric', listener);
+    }
+
+    public get isMultiPhaseJobSupported(): boolean {
+        return true;
+    }
+
+    /**
+     * Apply one metadata entry to the runner settings, then forward the same
+     * key/value pair to the EnvironmentService.
+     */
+    public async setClusterMetadata(key: string, value: string): Promise<void> {
+        switch (key) {
+            case TrialConfigMetadataKey.NNI_MANAGER_IP:
+                this.runnerSettings.nniManagerIP = (JSON.parse(value) as NNIManagerIpConfig).nniManagerIp;
+                break;
+            case TrialConfigMetadataKey.VERSION_CHECK:
+                this.versionCheck = (value === 'true' || value === 'True');
+                this.runnerSettings.nniManagerVersion = this.versionCheck ? await getVersion() : '';
+                break;
+            case TrialConfigMetadataKey.LOG_COLLECTION:
+                this.runnerSettings.logCollection = value;
+                break;
+            case TrialConfigMetadataKey.MULTI_PHASE:
+                // not useful, dismiss it.
+                break;
+            case TrialConfigMetadataKey.TRIAL_CONFIG: {
+                // TODO to support more storage types by better parameters.
+                const trialConfig: TrialConfig = JSON.parse(value);
+                this.trialConfig = trialConfig;
+
+                this.runnerSettings.command = trialConfig.command;
+                // Validate to make sure codeDir doesn't have too many files
+                await validateCodeDir(trialConfig.codeDir);
+                break;
+            }
+        }
+        const environmentService = component.get<EnvironmentService>(EnvironmentService);
+        await environmentService.config(key, value);
+    }
+
+    public getClusterMetadata(_key: string): Promise<string> {
+        throw new Error('Not implemented!');
+    }
+
+    /**
+     * Stop both loops, stop every environment that has not ended, then stop the rest server.
+     * A failure to stop one environment is logged and does not prevent the remaining cleanup.
+     */
+    public async cleanUp(): Promise<void> {
+        this.stopping = true;
+        const environmentService = component.get<EnvironmentService>(EnvironmentService);
+        // Snapshot first: the map may still be changed by the loops while awaiting below.
+        const environments = [...this.environments.values()];
+        for (const environment of environments) {
+            if (environment.isEnd === false) {
+                this.log.info(`stopping environment ${environment.id}...`);
+                try {
+                    await environmentService.stopEnvironment(environment);
+                    this.log.info(`stopped environment ${environment.id}.`);
+                } catch (error) {
+                    this.log.error(`failed to stop environment ${environment.id}: ${error}`);
+                }
+            }
+        }
+
+        try {
+            await this.jobRestServer.stop();
+            this.log.info('Rest server stopped successfully.');
+        } catch (error) {
+            this.log.error(`Rest server stopped failed, error: ${error.message}`);
+        }
+    }
+
+    /**
+     * Save an encoded command as a new file under "<environment working folder>/commands".
+     * File names carry a millisecond timestamp; when the name is taken, wait 1ms and retry.
+     * @throws Error when no unused file name is found after the retries are used up.
+     */
+    private async sendCommand(commandType: string, data: any, environment: EnvironmentInformation): Promise<void> {
+        let retryCount = 10;
+        let filePath: string = "";
+        const command = encodeCommand(commandType, JSON.stringify(data));
+        const storageService = component.get<StorageService>(StorageService);
+        const commandPath = storageService.joinRemotePath(environment.workingFolder, `commands`);
+
+        for (; ;) {
+            const fileName = `manager_command_${new Date().getTime()}.txt`;
+            filePath = storageService.joinRemotePath(commandPath, fileName);
+            if (!await storageService.existsRemote(filePath)) {
+                break;
+            }
+            if (retryCount === 0) {
+                throw new Error(`EnvironmentManager retry too many times to send command!`);
+            }
+            retryCount--;
+            await delay(1);
+        }
+
+        // NOTE: the command is written directly under its final name; no temp-name + rename
+        // step is done here, so a reader could observe a partially written file.
+        await storageService.save(command.toString("utf8"), filePath);
+    }
+
+    /**
+     * Every 5 seconds refresh the status of all environments that have not ended, and mark
+     * those whose status is no longer WAITING / RUNNING / UNKNOWN as ended.
+     */
+    private async environmentMaintenanceLoop(): Promise<void> {
+        const environmentService = component.get<EnvironmentService>(EnvironmentService);
+        while (!this.stopping) {
+            const environments: EnvironmentInformation[] = [];
+            this.environments.forEach((environment) => {
+                if (environment.isEnd === false) {
+                    environments.push(environment);
+                }
+            });
+            try {
+                // Must be awaited: the statuses are read right below.
+                await environmentService.updateEnvironmentsStatus(environments);
+            } catch (error) {
+                // Keep the loop alive; the next round retries with the last known statuses.
+                this.log.error(`failed to update environments status: ${error}`);
+            }
+
+            environments.forEach((environment) => {
+                switch (environment.status) {
+                    case 'WAITING':
+                    case 'RUNNING':
+                    case 'UNKNOWN':
+                        environment.isEnd = false;
+                        break;
+                    default:
+                        this.log.debug(`set environment ${environment.jobId} ${environment.status} to ended`);
+                        environment.isEnd = true;
+                        break;
+                }
+            });
+            await delay(5000);
+        }
+    }
+
+    /**
+     * Every 2 seconds: detect finished trials through their exit code file, hand idle
+     * environments to waiting trials, and request new environments when there are more
+     * live trials than live environments.
+     */
+    private async trialManagementLoop(): Promise<void> {
+        const storageService = component.get<StorageService>(StorageService);
+        while (!this.stopping) {
+            const waitingTrials: TrialDetail[] = [];
+            let liveTrialsCount = 0;
+            const trials = this.trials.values();
+            for (const trial of trials) {
+                const currentStatus = trial.status;
+                switch (currentStatus) {
+                    case "RUNNING":
+                        {
+                            // check status consistence with environment.
+                            const environment = trial.environment;
+                            if (environment === undefined) {
+                                this.log.error(`found running trial ${trial.id} has no environment, set trial to UNKNOWN.`);
+                                trial.status = "UNKNOWN";
+                            } else if (environment.status !== "RUNNING") {
+                                this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environment.status}', set trial to environment status.`);
+                                this.releaseEnvironment(trial);
+                                trial.status = environment.status;
+                            }
+
+                            // check if it's done.
+                            const fileName = trial.getExitCodeFileName();
+
+                            if (await storageService.existsRemote(fileName) === true) {
+                                const fileContent = await storageService.readRemoteFile(fileName);
+                                const match: RegExpMatchArray | null = fileContent.trim()
+                                    .match(/^-?(\d+)\s+(\d+)$/);
+                                if (match !== null) {
+                                    const { 1: code, 2: timestamp } = match;
+
+                                    if (trial.status === currentStatus) {
+                                        // Update trial job's status based on result code
+                                        if (parseInt(code, 10) === 0) {
+                                            trial.status = 'SUCCEEDED';
+                                        } else {
+                                            trial.status = 'FAILED';
+                                        }
+                                    }
+                                    trial.endTime = parseInt(timestamp, 10);
+                                    // The environment may be missing, or already released above.
+                                    if (trial.environment !== undefined) {
+                                        this.releaseEnvironment(trial);
+                                    }
+                                } else {
+                                    liveTrialsCount++;
+                                }
+                            } else {
+                                liveTrialsCount++;
+                            }
+                        }
+                        break;
+                    case "WAITING":
+                    case "UNKNOWN":
+                        // deal it later, if there is free environment.
+                        waitingTrials.push(trial);
+                        liveTrialsCount++;
+                        break;
+                }
+            }
+
+            let liveEnvironmentsCount = 0;
+            const idleEnvironments: EnvironmentInformation[] = [];
+            this.environments.forEach((environment) => {
+                if (!environment.isEnd) {
+                    liveEnvironmentsCount++;
+                    if (environment.status === "RUNNING" && environment.isIdle) {
+                        idleEnvironments.push(environment);
+                    }
+                }
+            });
+
+            // Give each waiting trial at most one idle environment; stop as soon as either
+            // side runs out, so no trial is ever assigned twice.
+            for (const trial of waitingTrials) {
+                const idleEnvironment = idleEnvironments.pop();
+                if (idleEnvironment === undefined) {
+                    break;
+                }
+                await this.assignEnvironment(trial, idleEnvironment);
+            }
+
+            if (liveEnvironmentsCount < liveTrialsCount) {
+                this.log.info(`request new environment, since live trials ${liveTrialsCount} ` +
+                    `is more than live environments ${liveEnvironmentsCount}`);
+                for (let index = 0; index < liveTrialsCount - liveEnvironmentsCount; index++) {
+                    await this.requestEnvironment();
+                }
+            }
+            await delay(2000);
+        }
+    }
+
+    /**
+     * Create the working folder of a new environment and ask the EnvironmentService to start it.
+     * @throws Error when the environment reports FAILED right after the start request.
+     */
+    private async requestEnvironment(): Promise<void> {
+        const environmentService = component.get<EnvironmentService>(EnvironmentService);
+        const storageService = component.get<StorageService>(StorageService);
+        const envId = uniqueString(5);
+        const name = `nni_exp_${this.experimentId}_env_${envId}`;
+        const environment = new EnvironmentInformation(envId, name);
+
+        environment.workingFolder = storageService.joinRemotePath("envs", envId);
+        environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`;
+
+        await storageService.createDirectory(environment.workingFolder);
+
+        // The always-on debugging branch that copied nni_trial_tool from a hard-coded
+        // developer path (D:\code\nni\...) was removed; it cannot work on another machine.
+
+        this.environments.set(environment.id, environment);
+        await environmentService.startEnvironment(environment);
+
+        if (environment.status === "FAILED") {
+            environment.isIdle = false;
+            environment.isEnd = true;
+            throw new Error(`error on request environment ${environment.jobId}, please check log for more details.`);
+        } else {
+            environment.isIdle = true;
+            environment.isEnd = false;
+        }
+        this.log.info(`requested environment ${environment.id} and job id is ${environment.jobId}.`);
+    }
+
+    /**
+     * Bind an idle environment to a trial, mark the trial RUNNING and send NEW_TRIAL_JOB.
+     * @throws Error when the trial already has an environment or the environment is not idle.
+     */
+    private async assignEnvironment(trial: TrialDetail, environment: EnvironmentInformation): Promise<void> {
+        if (trial.environment) {
+            // Report the environment the trial already holds, not the one being offered.
+            throw new Error(`trial ${trial.id} has assigned environment ${trial.environment.id} already!`);
+        }
+        if (environment.isIdle === false) {
+            throw new Error(`environment ${environment.id} is not idle, and cannot be assigned again!`);
+        }
+        this.log.info(`assigning environment ${environment.id} to trial ${trial.id}.`);
+
+        environment.isIdle = false;
+        trial.environment = environment;
+        const settings = {
+            trialId: trial.id,
+            sequenceId: trial.form.sequenceId,
+            parameter: trial.form.hyperParameters,
+        };
+        trial.startTime = Date.now();
+        trial.status = "RUNNING";
+        await this.sendCommand(NEW_TRIAL_JOB, settings, environment);
+    }
+
+    /**
+     * Detach the trial from its environment and mark that environment idle again.
+     * @throws Error when the trial has no environment or the environment is already idle.
+     */
+    private releaseEnvironment(trial: TrialDetail): void {
+        if (!trial.environment) {
+            throw new Error(`environment is not assigned to trial ${trial.id}, and cannot be released!`);
+        }
+        if (trial.environment.isIdle) {
+            throw new Error(`environment ${trial.environment.id} is idle already!`);
+        }
+        trial.environment.isIdle = true;
+        trial.environment = undefined;
+    }
+}
+
+export { EnvironmentManager };
diff --git a/src/nni_manager/training_service/pai/reusable/jobRestServer.ts b/src/nni_manager/training_service/pai/reusable/jobRestServer.ts
new file mode 100644
index 0000000000..133d80942d
--- /dev/null
+++ b/src/nni_manager/training_service/pai/reusable/jobRestServer.ts
@@ -0,0 +1,88 @@
+/**
+ * Copyright (c) Microsoft Corporation
+ * All rights reserved.
+ *
+ * MIT License
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+ * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+'use strict';
+
+import { EventEmitter } from 'events';
+import { Request, Response, Router } from 'express';
+import { ClusterJobRestServer } from '../../common/clusterJobRestServer';
+
+/**
+ * Shape of one entry kept in JobRestServer.parameterFileMetaList.
+ */
+export interface ParameterFileMeta {
+    readonly experimentId: string;
+    readonly trialId: string;
+    readonly filePath: string;
+}
+
+/**
+ * TODO: it should be merged into ClusterJobRestServer
+ */
+export class JobRestServer extends ClusterJobRestServer {
+    // Filled by POST /parameter-file-meta and returned by GET /parameter-file-meta.
+    protected parameterFileMetaList: ParameterFileMeta[] = [];
+
+    protected readonly metricsEmitter: EventEmitter;
+
+    /**
+     * constructor to provide NNIRestServer's own rest property, e.g. port
+     */
+    constructor(metricsEmitter: EventEmitter) {
+        super();
+        this.metricsEmitter = metricsEmitter;
+        this.setEnableVersionCheck = true;
+    }
+
+    /**
+     * Emit one 'metric' event per element of the metrics array.
+     */
+    protected handleTrialMetrics(jobId: string, metrics: any[]): void {
+        // Warning: If not split metrics into single ones, the behavior will be UNKNOWN
+        for (let position = 0; position < metrics.length; position++) {
+            const metricEvent = {
+                id: jobId,
+                data: metrics[position]
+            };
+            this.metricsEmitter.emit('metric', metricEvent);
+        }
+    }
+
+    /**
+     * Extend the parent router with the POST and GET handlers for /parameter-file-meta.
+     */
+    protected createRestHandler(): Router {
+        const router: Router = super.createRestHandler();
+
+        const storeParameterFileMeta = (req: Request, res: Response): void => {
+            try {
+                this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`);
+                this.parameterFileMetaList.push(req.body);
+                res.send();
+            } catch (err) {
+                this.log.error(`POST parameter-file-meta error: ${err}`);
+                res.status(500);
+                res.send(err.message);
+            }
+        };
+
+        const listParameterFileMeta = (req: Request, res: Response): void => {
+            try {
+                this.log.info(`GET /parameter-file-meta`);
+                res.send(this.parameterFileMetaList);
+            } catch (err) {
+                this.log.error(`GET parameter-file-meta error: ${err}`);
+                res.status(500);
+                res.send(err.message);
+            }
+        };
+
+        router.post(`/parameter-file-meta`, storeParameterFileMeta);
+        router.get(`/parameter-file-meta`, listParameterFileMeta);
+
+        return router;
+    }
+}
diff --git a/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts
b/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts new file mode 100644 index 0000000000..5dd0c380fa --- /dev/null +++ b/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts @@ -0,0 +1,147 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+import * as fs from 'fs';
+import * as path from 'path';
+import { Deferred } from "ts-deferred";
+import { StorageService } from "./storage";
+
+/**
+ * StorageService implementation that works on paths of the local file system
+ * through the Node.js `fs` and `path` modules.
+ */
+export class MountedStorageService extends StorageService {
+
+    protected config(_key: string, _value: string): void {
+        // nothing to config
+    }
+
+    /**
+     * Remove a file or a directory.
+     * With isRecursive the children are removed first, then the directory itself;
+     * without it the directory has to be empty already.
+     */
+    protected async remove(path: string, isDirectory: boolean, isRecursive: boolean): Promise<void> {
+        if (isDirectory) {
+            if (isRecursive) {
+                const children = await fs.promises.readdir(path);
+                for (const file of children) {
+                    // readdir returns bare names, so resolve each against its parent.
+                    const childPath = this.joinPath(path, file);
+                    const stat = await fs.promises.lstat(childPath);
+                    await this.remove(childPath, stat.isDirectory(), isRecursive);
+                }
+            }
+            await fs.promises.rmdir(path);
+        } else {
+            await fs.promises.unlink(path);
+        }
+    }
+
+    /**
+     * Rename remotePath to newName, keeping it in the same directory.
+     */
+    protected async rename(remotePath: string, newName: string): Promise<void> {
+        const dirName = path.dirname(remotePath);
+        newName = this.joinPath(dirName, newName);
+
+        await fs.promises.rename(remotePath, newName);
+    }
+
+    protected async mkdir(remotePath: string): Promise<void> {
+        if (!fs.existsSync(remotePath)) {
+            await fs.promises.mkdir(remotePath, { recursive: true });
+        }
+    }
+
+    /**
+     * Copy a file or directory between localPath and remotePath; isToRemote selects the direction.
+     * The source keeps its base name and is placed inside the target folder.
+     * @returns path of the copied file or directory, or remotePath when both paths are equal.
+     */
+    protected async copy(localPath: string, remotePath: string, isDirectory: boolean, isToRemote: boolean): Promise<string> {
+        if (localPath === remotePath) {
+            return remotePath;
+        }
+        const sourcePath = isToRemote ? localPath : remotePath;
+        let targetPath = isToRemote ? remotePath : localPath;
+
+        this.logger.debug(`copying ${sourcePath} to ${targetPath}, dir ${isDirectory}, isRemote: ${isToRemote}`);
+        if (isDirectory) {
+            if (isToRemote) {
+                targetPath = this.joinPath(targetPath, this.basename(localPath));
+            } else {
+                targetPath = path.join(targetPath, this.basename(remotePath));
+            }
+            await this.mkdir(targetPath);
+            const children = await fs.promises.readdir(sourcePath);
+            for (const child of children) {
+                const childSourcePath = this.joinPath(sourcePath, child);
+                const stat = await fs.promises.lstat(childSourcePath);
+                // true: the source and target is aligned already, so always set isToRemote to true.
+                // Awaited so the copy is complete, and its errors reported, before returning.
+                await this.copy(childSourcePath, targetPath, stat.isDirectory(), true);
+            }
+            return targetPath;
+        } else {
+            // This behavior may not be consistent for each platform, but it needs to correct to same
+            await this.mkdir(targetPath);
+            const targetFileName = path.join(targetPath, path.basename(sourcePath));
+            await fs.promises.copyFile(sourcePath, targetFileName);
+            return targetFileName;
+        }
+    }
+
+    /**
+     * Tell whether remotePath is accessible; fs.exists is deprecated, so fs.promises.access is used.
+     */
+    protected async exists(remotePath: string): Promise<boolean> {
+        try {
+            await fs.promises.access(remotePath);
+            return true;
+        } catch (_error) {
+            return false;
+        }
+    }
+
+    /**
+     * Read up to `length` bytes starting at byte `offset` and decode them as utf8.
+     * A missing or negative offset means 0; a missing or negative length means 1MB.
+     */
+    protected async read(remotePath: string, offset?: number, length?: number): Promise<string> {
+        const deferred = new Deferred<string>();
+        // set a max length to 1MB for performance concern.
+        const maxLength = 1024 * 1024;
+        const current: number = (offset === undefined || offset < 0) ? 0 : offset;
+        const readLength: number = (length === undefined || length < 0) ? maxLength : length;
+        if (readLength === 0) {
+            // An inclusive end below start is not a valid range for createReadStream.
+            return "";
+        }
+        let result: string = "";
+
+        const stream = fs.createReadStream(remotePath,
+            {
+                encoding: "utf8",
+                start: current,
+                // `end` is inclusive, so the last byte wanted is start + length - 1.
+                end: current + readLength - 1,
+            }).on("data", (data) => {
+                result += data;
+            }).on("end", () => {
+                stream.close();
+                deferred.resolve(result);
+            }).on("error", (err) => {
+                deferred.reject(err);
+            });
+
+        return deferred.promise;
+
+    }
+
+    protected isRelativePath(remotePath: string): boolean {
+        return !path.isAbsolute(remotePath);
+    }
+
+    protected joinPath(...paths: string[]): string {
+        return path.join(...paths);
+    }
+
+    protected dirname(remotePath: string): string {
+        return path.dirname(remotePath);
+    }
+
+    protected basename(remotePath: string): string {
+        return path.basename(remotePath);
+    }
+}
diff --git a/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts
new file mode 100644
index 0000000000..5422cdef35
--- /dev/null
+++ b/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts
@@ -0,0 +1,410 @@
+/**
+ * Copyright (c) Microsoft Corporation
+ * All rights reserved.
+ *
+ * MIT License
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+ * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+'use strict';
+
+import * as fs from 'fs';
+import * as request from 'request';
+import { Deferred } from 'ts-deferred';
+import * as component from '../../../common/component';
+import { getExperimentId } from '../../../common/experimentStartupInfo';
+import { getLogger, Logger } from '../../../common/log';
+import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey';
+import { PAIClusterConfig } from '../paiConfig';
+import { NNIPAIK8STrialConfig } from '../paiK8S/paiK8SConfig';
+import { EnvironmentInformation, EnvironmentService } from './environment';
+import { StorageService } from './storage';
+
+const yaml = require('js-yaml');
+
+/**
+ * Collector PAI jobs info from PAI cluster, and update pai job status locally
+ */
+@component.Singleton
+export class OpenPaiEnvironmentService implements EnvironmentService {
+
+    private readonly log: Logger = getLogger();
+    private paiClusterConfig: PAIClusterConfig | undefined;
+    private paiTrialConfig: NNIPAIK8STrialConfig | undefined;
+    // Job config loaded from paiTrialConfig.paiConfigPath, when that path is set.
+    private paiJobConfig: any;
+    private paiToken?: string;
+    private paiTokenUpdateTime?: number;
+    private readonly paiTokenUpdateInterval: number;
+    // 'http' unless the configured host starts with 'https://', see formatPAIHost().
+    private protocol: string = 'http';
+
+    private experimentId: string;
+
+    constructor() {
+        this.paiTokenUpdateInterval = 7200000; //2hours
+        this.experimentId = getExperimentId();
+    }
+
+    /**
+     * Query the job list of the configured user and copy each job's state into the
+     * matching environment (matched by environment.jobId against the job name).
+     * Rejects when the request fails, returns http code >= 400, or the body cannot be processed.
+     */
+    public async updateEnvironmentsStatus(environments: EnvironmentInformation[]): Promise<void> {
+        const deferred: Deferred<void> = new Deferred<void>();
+        await this.refreshPlatform();
+
+        if (this.paiClusterConfig === undefined) {
+            throw new Error('PAI Cluster config is not initialized');
+        }
+        if (this.paiToken === undefined) {
+            throw new Error('PAI token is not initialized');
+        }
+
+        const getJobInfoRequest: request.Options = {
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs?username=${this.paiClusterConfig.userName}`,
+            method: 'GET',
+            json: true,
+            headers: {
+                'Content-Type': 'application/json',
+                Authorization: `Bearer ${this.paiToken}`
+            }
+        };
+
+        request(getJobInfoRequest, (error: any, response: request.Response, body: any) => {
+            if ((error !== undefined && error !== null) || response.statusCode >= 400) {
+                this.log.error(`PAI Training service: get environment info from PAI Cluster failed!\nerror: ${error}`);
+                // Always reject with a real error, also when only the http code signals the failure.
+                deferred.reject((error !== undefined && error !== null) ? error :
+                    new Error(`get environment info from PAI Cluster failed, http code: ${response.statusCode}`));
+                return;
+            }
+            try {
+                const jobInfos = new Map<string, any>();
+                body.forEach((jobInfo: any) => {
+                    jobInfos.set(jobInfo.name, jobInfo);
+                });
+
+                environments.forEach((environment) => {
+                    if (jobInfos.has(environment.jobId)) {
+                        const jobResponse = jobInfos.get(environment.jobId);
+                        if (jobResponse && jobResponse.state) {
+                            switch (jobResponse.state) {
+                                case 'WAITING':
+                                case 'RUNNING':
+                                case 'SUCCEEDED':
+                                case 'FAILED':
+                                    environment.status = jobResponse.state;
+                                    break;
+                                case 'STOPPED':
+                                case 'STOPPING':
+                                    environment.status = 'USER_CANCELED';
+                                    break;
+                                default:
+                                    this.log.error(`OpenPAI: job ${environment.jobId} returns unknown state ${jobResponse.state}.`);
+                                    environment.status = 'UNKNOWN';
+                            }
+                        } else {
+                            this.log.error(`OpenPAI: job ${environment.jobId} has no state returned. body:${JSON.stringify(jobResponse)}`);
+                            // some error happens, and mark this environment
+                            environment.status = 'FAILED';
+                        }
+                    } else {
+                        this.log.error(`OpenPAI job ${environment.jobId} is not found in job list.`);
+                        environment.status = 'UNKNOWN';
+                    }
+                });
+                deferred.resolve();
+            } catch (processError) {
+                // e.g. the body is not an array; settle the promise instead of leaving it pending.
+                this.log.error(`OpenPAI: failed to process job list response: ${processError}`);
+                deferred.reject(processError);
+            }
+        });
+        return deferred.promise;
+    }
+
+    /**
+     * Submit the environment as an OpenPAI job.
+     * A failed submission is logged and reported through environment.status = 'FAILED';
+     * the returned promise still resolves in that case.
+     */
+    public async startEnvironment(environment: EnvironmentInformation): Promise<void> {
+        const deferred: Deferred<void> = new Deferred<void>();
+
+        await this.refreshPlatform();
+
+        if (this.paiClusterConfig === undefined) {
+            throw new Error('PAI Cluster config is not initialized');
+        }
+        if (this.paiToken === undefined) {
+            throw new Error('PAI token is not initialized');
+        }
+        if (this.paiTrialConfig === undefined) {
+            throw new Error('PAI trial config is not initialized');
+        }
+
+        // Step 1. Prepare PAI job configuration
+        environment.envWorkingFolder = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/envs/${environment.id}`;
+        environment.command = `cd ${environment.envWorkingFolder} && ${environment.command}`;
+        environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.jobId}`;
+
+        // Generate Job Configuration in yaml format
+        const paiJobConfig = this.generateJobConfigInYamlFormat(environment);
+        this.log.debug(`generated paiJobConfig: ${paiJobConfig}`);
+        // Step 2. Submit PAI job via Rest call
+        const submitJobRequest: request.Options = {
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`,
+            method: 'POST',
+            body: paiJobConfig,
+            headers: {
+                'Content-Type': 'text/yaml',
+                Authorization: `Bearer ${this.paiToken}`
+            }
+        };
+        request(submitJobRequest, (error, response, body) => {
+            if ((error !== undefined && error !== null) || response.statusCode >= 400) {
+                const errorMessage: string = (error !== undefined && error !== null) ? error.message :
+                    `start environment ${environment.jobId} failed, http code:${response.statusCode}, http body: ${body}`;
+
+                this.log.error(errorMessage);
+                environment.status = 'FAILED';
+            }
+            deferred.resolve();
+        });
+
+        return deferred.promise;
+    }
+
+    /**
+     * Ask OpenPAI to stop the job of this environment.
+     * Rejects when the config or token is missing, the request fails, or it returns http code >= 400.
+     */
+    public async stopEnvironment(environment: EnvironmentInformation): Promise<void> {
+        const deferred: Deferred<void> = new Deferred<void>();
+
+        if (this.paiClusterConfig === undefined) {
+            return Promise.reject(new Error('PAI Cluster config is not initialized'));
+        }
+        if (this.paiToken === undefined) {
+            return Promise.reject(new Error('PAI token is not initialized'));
+        }
+
+        const stopJobRequest: request.Options = {
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs/${this.paiClusterConfig.userName}~${environment.jobId}/executionType`,
+            method: 'PUT',
+            json: true,
+            body: { value: 'STOP' },
+            time: true,
+            headers: {
+                'Content-Type': 'application/json',
+                Authorization: `Bearer ${this.paiToken}`
+            }
+        };
+
+        this.log.debug(`stopping OpenPAI environment ${environment.jobId}, ${stopJobRequest.uri}`);
+
+        try {
+            request(stopJobRequest, (error, response, _body) => {
+                try {
+                    if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) {
+                        // response can be missing when the request itself failed.
+                        const statusCode = response ? response.statusCode : undefined;
+                        this.log.error(`OpenPAI Training service: stop job ${environment.jobId} failed with ${statusCode}\n${error}`);
+                        deferred.reject((error !== undefined && error !== null) ? error :
+                            new Error(`Stop trial failed, http code: ${statusCode}`));
+                    } else {
+                        this.log.info(`OpenPAI job ${environment.jobId} stopped, body: ${JSON.stringify(response.body)}.`);
+                        deferred.resolve();
+                    }
+                } catch (error) {
+                    this.log.error(`OpenPAI error when inner stopping environment ${error}`);
+                    deferred.reject(error);
+                }
+            });
+        } catch (error) {
+            this.log.error(`OpenPAI error when stopping environment ${error}`);
+            return Promise.reject(error);
+        }
+
+        return deferred.promise;
+    }
+
+    /**
+     * Consume the metadata entries this service needs: the PAI cluster config and the trial config.
+     * Other keys are only logged at debug level.
+     */
+    public async config(key: string, value: string): Promise<void> {
+        switch (key) {
+            case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: {
+                const clusterConfig: PAIClusterConfig = JSON.parse(value);
+                clusterConfig.host = this.formatPAIHost(clusterConfig.host);
+                this.paiClusterConfig = clusterConfig;
+                if (clusterConfig.passWord) {
+                    // Get PAI authentication token
+                    await this.updatePaiToken();
+                } else if (clusterConfig.token) {
+                    this.paiToken = clusterConfig.token;
+                }
+                break;
+            }
+
+            case TrialConfigMetadataKey.TRIAL_CONFIG: {
+                if (this.paiClusterConfig === undefined) {
+                    this.log.error('pai cluster config is not initialized');
+                    break;
+                }
+                const trialConfig: NNIPAIK8STrialConfig = JSON.parse(value);
+                this.paiTrialConfig = trialConfig;
+
+                // The storage root of this experiment is "<nniManagerNFSMountPath>/<experimentId>".
+                const storageService = component.get<StorageService>(StorageService);
+                const remoteRoot = storageService.joinRemotePath(trialConfig.nniManagerNFSMountPath, this.experimentId);
+                storageService.initialize(trialConfig.nniManagerNFSMountPath, remoteRoot);
+
+                if (trialConfig.paiConfigPath) {
+                    this.paiJobConfig = yaml.safeLoad(fs.readFileSync(trialConfig.paiConfigPath, 'utf8'));
+                }
+                break;
+            }
+            case TrialConfigMetadataKey.MULTI_PHASE:
+                break;
+            default:
+                //Reject for unknown keys
+                this.log.debug(`OpenPAI not proccessed metadata key: '${key}', value: '${value}'`);
+        }
+    }
+
+    /**
+     * Refresh the PAI token when a password is configured.
+     * A refresh failure is only fatal when there is no previously obtained token to fall back on.
+     */
+    private async refreshPlatform(): Promise<void> {
+        if (this.paiClusterConfig && this.paiClusterConfig.passWord) {
+            try {
+                await this.updatePaiToken();
+            } catch (error) {
+                this.log.error(`${error}`);
+                if (this.paiToken === undefined) {
+                    // Rethrow the original error instead of wrapping it into another Error.
+                    throw error instanceof Error ? error : new Error(`${error}`);
+                }
+            }
+        }
+    }
+
+    /**
+     * Build the job config for one environment and dump it as yaml.
+     * With paiConfigPath set, the user's config is cloned and the command of every task role
+     * is prefixed with the environment command; otherwise a single-task-role job is generated.
+     */
+    private generateJobConfigInYamlFormat(environment: EnvironmentInformation): any {
+        if (this.paiTrialConfig === undefined) {
+            throw new Error('trial config is not initialized');
+        }
+        const jobName = environment.jobId;
+
+        let nniJobConfig: any = undefined;
+        if (this.paiTrialConfig.paiConfigPath) {
+            nniJobConfig = JSON.parse(JSON.stringify(this.paiJobConfig)); //Trick for deep clone in Typescript
+            nniJobConfig.name = jobName;
+            if (nniJobConfig.taskRoles) {
+                // taskRoles is keyed by role name (as in the generated config below), so count
+                // its keys; `.length` is undefined on such an object.
+                environment.serverCount = Object.keys(nniJobConfig.taskRoles).length;
+
+                // Each taskRole will generate new command in NNI's command format
+                // Each command will be formatted to NNI style
+                for (const taskRoleIndex in nniJobConfig.taskRoles) {
+                    const commands = nniJobConfig.taskRoles[taskRoleIndex].commands;
+                    const nniTrialCommand = `${environment.command} ${commands.join(" && ").replace(/(["'$`\\])/g, '\\$1')}`;
+                    nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand];
+                }
+            }
+
+        } else {
+            nniJobConfig = {
+                protocolVersion: 2,
+                name: jobName,
+                type: 'job',
+                jobRetryCount: 0,
+                prerequisites: [
+                    {
+                        type: 'dockerimage',
+                        uri: this.paiTrialConfig.image,
+                        name: 'docker_image_0'
+                    }
+                ],
+                taskRoles: {
+                    taskrole: {
+                        instances: 1,
+                        completion: {
+                            minFailedInstances: 1,
+                            minSucceededInstances: -1
+                        },
+                        taskRetryCount: 0,
+                        dockerImage: 'docker_image_0',
+                        resourcePerInstance: {
+                            gpu: this.paiTrialConfig.gpuNum,
+                            cpu: this.paiTrialConfig.cpuNum,
+                            memoryMB: this.paiTrialConfig.memoryMB
+                        },
+                        commands: [
+                            environment.command
+                        ]
+                    }
+                },
+                extras: {
+                    'com.microsoft.pai.runtimeplugin': [
+                        {
+                            plugin: this.paiTrialConfig.paiStoragePlugin
+                        }
+                    ],
+                    submitFrom: 'submit-job-v2'
+                }
+            };
+            if (this.paiTrialConfig.virtualCluster) {
+                nniJobConfig.defaults = {
+                    virtualCluster: this.paiTrialConfig.virtualCluster
+                };
+            }
+        }
+        return yaml.safeDump(nniJobConfig);
+    }
+
+    /**
+     * Strip a leading 'http://' or 'https://' from the host and remember it as the protocol.
+     * A host without such a prefix is returned unchanged and the current protocol is kept.
+     */
+    protected formatPAIHost(host: string): string {
+        if (host.startsWith('http://')) {
+            this.protocol = 'http';
+            return host.replace('http://', '');
+        } else if (host.startsWith('https://')) {
+            this.protocol = 'https';
+            return host.replace('https://', '');
+        } else {
+            return host;
+        }
+    }
+    /**
+     * Update pai token by the interval time or initialize the pai token
+     */
+    protected async updatePaiToken(): Promise<void> {
+        const deferred: Deferred<void> = new Deferred<void>();
+
+        const currentTime: number = new Date().getTime();
+        //If pai token initialized and not reach the interval time, do not update
+        if (this.paiTokenUpdateTime !== undefined && (currentTime - this.paiTokenUpdateTime) < this.paiTokenUpdateInterval) {
+            return Promise.resolve();
+        }
+
+        if (this.paiClusterConfig === undefined) {
+            const paiClusterConfigError: string = `pai cluster config not initialized!`;
+            this.log.error(`${paiClusterConfigError}`);
+            throw Error(`${paiClusterConfigError}`);
+        }
+
+        const authenticationReq: request.Options = {
+            uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/token`,
+            method: 'POST',
+            json: true,
+            body: {
+                username: this.paiClusterConfig.userName,
+                password: this.paiClusterConfig.passWord
+            }
+        };
+        // Only the uri is ever logged: the request body carries the user's password.
+        const tokenUri: string = `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v1/token`;
+
+        request(authenticationReq, (error: any, response: request.Response, body: any) => {
+            if (error !== undefined && error !== null) {
+                this.log.error(`Get PAI token failed: ${error.message}, uri: ${tokenUri}`);
+                deferred.reject(new Error(`Get PAI token failed: ${error.message}`));
+            } else {
+                if (response.statusCode !== 200) {
+                    this.log.error(`Get PAI token failed: get PAI Rest return code ${response.statusCode}, uri: ${tokenUri}`);
+                    deferred.reject(new Error(`Get PAI token failed code: ${response.statusCode}, body: ${JSON.stringify(response.body)}, please check paiConfig username or password`));
+                } else {
+                    this.paiToken = body.token;
+                    this.paiTokenUpdateTime = new Date().getTime();
+                    deferred.resolve();
+                }
+            }
+        });
+
+        let timeoutId: NodeJS.Timer;
+        const timeoutDelay: Promise<void> = new Promise<void>((_resolve: Function, reject: Function): void => {
+            // Set timeout and reject the promise once reach timeout (5 seconds)
+            timeoutId = setTimeout(
+                () => reject(new Error('Get PAI token timeout. Please check your PAI cluster.')),
+                5000);
+        });
+
+        return Promise.race([timeoutDelay, deferred.promise])
+            .finally(() => { clearTimeout(timeoutId); });
+    }
+}
diff --git a/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts b/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts
new file mode 100644
index 0000000000..8668fa4b1d
--- /dev/null
+++ b/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts
@@ -0,0 +1,173 @@
+/**
+ * Copyright (c) Microsoft Corporation
+ * All rights reserved.
+ *
+ * MIT License
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+ * documentation files (the "Software"), to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and
+ * to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+ * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
+ * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +import * as component from '../../../common/component'; +import { getLogger, Logger } from '../../../common/log'; +import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../../common/trainingService'; +import { delay } from '../../../common/utils'; +import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; +import { PAIClusterConfig } from '../paiConfig'; +import { PAIK8STrainingService } from '../paiK8S/paiK8STrainingService'; +import { EnvironmentManager } from './environmentManager'; +import { Container, Scope } from 'typescript-ioc'; +import { EnvironmentService } from './environment'; +import { OpenPaiEnvironmentService } from './openPaiEnvironmentService'; +import { StorageService } from './storage'; +import { MountedStorageService } from './mountedStorageService'; + + +/** + * It's a intermedia implementation to support reusable training service. + * The final goal is to support reusable training job in higher level than training service. 
+ */ +@component.Singleton +class ReusableTrainingService implements TrainingService { + protected readonly log!: Logger; + private internalTrainingService: TrainingService | undefined; + private metaDataCache: Map = new Map(); + + constructor() { + this.log = getLogger(); + } + + public async listTrialJobs(): Promise { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + return await this.internalTrainingService.listTrialJobs(); + } + + public async getTrialJob(trialJobId: string): Promise { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + return await this.internalTrainingService.getTrialJob(trialJobId); + } + + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + this.internalTrainingService.addTrialJobMetricListener(listener); + } + + public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + this.internalTrainingService.removeTrialJobMetricListener(listener); + } + + public async submitTrialJob(form: TrialJobApplicationForm): Promise { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + return await this.internalTrainingService.submitTrialJob(form); + } + + public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + return await this.internalTrainingService.updateTrialJob(trialJobId, form); + } + + public get isMultiPhaseJobSupported(): boolean { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + 
return this.internalTrainingService.isMultiPhaseJobSupported; + } + + public async cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean | undefined): Promise { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + await this.internalTrainingService.cancelTrialJob(trialJobId, isEarlyStopped); + } + + public async setClusterMetadata(key: string, value: string): Promise { + if (this.internalTrainingService === undefined) { + if (key === TrialConfigMetadataKey.PAI_CLUSTER_CONFIG) { + const config = JSON.parse(value); + if (config.reuse === true) { + this.log.info(`reuse flag enabled, use EnvironmentManager.`); + this.internalTrainingService = component.get(EnvironmentManager); + + // TODO to support other serivces later. + Container.bind(EnvironmentService) + .to(OpenPaiEnvironmentService) + .scope(Scope.Singleton); + // TODO to support other storages later. + Container.bind(StorageService) + .to(MountedStorageService) + .scope(Scope.Singleton); + } else { + this.log.debug(`caching metadata key:{} value:{}, as training service is not determined.`); + this.internalTrainingService = component.get(PAIK8STrainingService); + } + for (const [key, value] of this.metaDataCache) { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + await this.internalTrainingService.setClusterMetadata(key, value); + } + + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + await this.internalTrainingService.setClusterMetadata(key, value); + + this.metaDataCache.clear(); + } else { + this.log.debug(`caching metadata key:{} value:{}, as training service is not determined.`); + this.metaDataCache.set(key, value); + } + } else { + await this.internalTrainingService.setClusterMetadata(key, value); + } + } + + public async getClusterMetadata(key: string): Promise { + if (this.internalTrainingService === undefined) { + 
throw new Error("TrainingService is not assigned!"); + } + return await this.internalTrainingService.getClusterMetadata(key); + } + + public async cleanUp(): Promise { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + await this.internalTrainingService.cleanUp(); + } + + public async run(): Promise { + // wait internal training service is assigned. + // It will be assigned after set metadata of paiConfig + while (this.internalTrainingService === undefined) { + await delay(100); + } + return await this.internalTrainingService.run(); + } +} + +export { ReusableTrainingService }; diff --git a/src/nni_manager/training_service/pai/reusable/storage.ts b/src/nni_manager/training_service/pai/reusable/storage.ts new file mode 100644 index 0000000000..35071085ee --- /dev/null +++ b/src/nni_manager/training_service/pai/reusable/storage.ts @@ -0,0 +1,184 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +import { uniqueString } from '../../../common/utils'; +import * as fs from 'fs'; +import * as os from 'os'; +import * as path from 'path'; +import { Logger, getLogger } from '../../../common/log'; +import { tarAdd } from '../../common/util'; + +export abstract class StorageService { + protected localRoot: string = ""; + protected remoteRoot: string = ""; + protected logger: Logger; + + protected abstract config(key: string, value: string): void; + protected abstract async remove(remotePath: string, isDirectory: boolean, isRecursive: boolean): Promise; + protected abstract async rename(remotePath: string, newName: string): Promise; + protected abstract async mkdir(remotePath: string): Promise; + protected abstract async copy(localPath: string, remotePath: string, isDirectory: boolean, isToRemote: boolean): Promise; + protected abstract async exists(remotePath: string): Promise; + protected abstract async read(remotePath: string, offset: number, length: number): Promise; + protected abstract isRelativePath(path: string): boolean; + protected abstract joinPath(...paths: string[]): string; + protected abstract dirname(...paths: string[]): string; + protected abstract basename(...paths: string[]): string; + + constructor() { + this.logger = getLogger(); + } + + public initialize(localRoot: string, remoteRoot: string): void { + this.logger.debug(`Initializing storage to local: ${localRoot} remote: ${remoteRoot}`); + this.localRoot = localRoot; + this.remoteRoot = remoteRoot; + } + + public async renameRemote(remotePath: string, newName: string): Promise { + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`rename remotePath: ${remotePath} to: ${newName}`); + await 
this.rename(remotePath, newName); + } + + public async createDirectory(remotePath: string): Promise { + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`create remotePath: ${remotePath}`); + await this.mkdir(remotePath); + } + + public async copyDirectory(localPath: string, remotePath: string, asGzip: boolean = false): Promise { + localPath = this.expandPath(false, localPath); + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`copy localPath: ${localPath} to remotePath: ${remotePath}, asGzip ${asGzip}`); + if (!await this.existsRemote(remotePath)) { + await this.mkdir(remotePath); + } + + if (asGzip) { + const localPathBaseName = path.basename(localPath); + const tempTarFileName = `nni_tmp_${localPathBaseName}_${uniqueString(5)}.tar.gz`; + const tarFileName = `${localPathBaseName}.tar.gz`; + const localTarPath: string = path.join(os.tmpdir(), tempTarFileName); + await tarAdd(localTarPath, localPath); + await this.copy(localTarPath, remotePath, false, true); + const remoteFileName = this.joinPath(remotePath, tempTarFileName); + await this.rename(remoteFileName, tarFileName); + await fs.promises.unlink(localTarPath); + + remotePath = this.joinPath(remotePath, tarFileName); + } else { + await this.copy(localPath, remotePath, true, true); + remotePath = this.joinPath(remotePath, path.basename(localPath)); + } + + return remotePath; + } + + public async copyDirectoryBack(remotePath: string, localPath: string): Promise { + localPath = this.expandPath(false, localPath); + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`copy remotePath: ${remotePath} to localPath: ${localPath}`); + return await this.copy(localPath, remotePath, true, false); + } + + public async removeDirectory(remotePath: string, isRecursive: boolean): Promise { + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`remove remotePath: ${remotePath}`); + await this.remove(remotePath, true, isRecursive); + } + + public async 
readRemoteFile(remotePath: string, offset: number = -1, length: number = -1): Promise { + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`read remote file: ${remotePath}, offset: ${offset}, length: ${length}`); + return this.read(remotePath, offset, length); + } + + public async existsRemote(remotePath: string): Promise { + remotePath = this.expandPath(true, remotePath); + const exists = await this.exists(remotePath); + this.logger.debug(`check exists remotePath: ${remotePath} is ${exists}`); + return exists + } + + public async save(content: string, remotePath: string): Promise { + this.logger.debug(`save content to remotePath: ${remotePath}, length: ${content.length}`); + const fileName = this.basename(remotePath); + const tempFileName = `temp_${uniqueString(4)}_${fileName}`; + + remotePath = this.expandPath(true, remotePath); + const localTempFileName = path.join(os.tmpdir(), tempFileName); + + const remoteDir = this.dirname(remotePath); + const remoteTempFile = this.joinPath(remoteDir, tempFileName); + + if (await this.exists(remotePath) === true) { + await this.remove(remotePath, false, false); + } + await fs.promises.writeFile(localTempFileName, content); + await this.copy(localTempFileName, remoteDir, false, true); + await this.renameRemote(remoteTempFile, fileName); + await fs.promises.unlink(localTempFileName); + } + + public async copyFile(localPath: string, remotePath: string): Promise { + localPath = this.expandPath(false, localPath); + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`copy file localPath: ${localPath} to remotePath: ${remotePath}`); + await this.copy(localPath, remotePath, false, true); + } + + public async copyFileBack(remotePath: string, localPath: string): Promise { + localPath = this.expandPath(false, localPath); + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`copy file remotePath: ${remotePath} to localPath: ${localPath}`); + await this.copy(localPath, remotePath, 
false, false); + } + + public async removeFile(remotePath: string): Promise { + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`remove file remotePath: ${remotePath}`); + await this.remove(remotePath, false, false); + } + + public joinRemotePath(...paths: string[]): string { + let fullPath = this.joinPath(...paths); + if (this.isRelativePath(fullPath) === true && this.remoteRoot !== "") { + fullPath = this.joinPath(this.remoteRoot, fullPath); + } + return fullPath; + } + + private expandPath(isRemote: boolean, ...paths: string[]): string { + let normalizedPath: string; + + if (isRemote) { + normalizedPath = this.joinRemotePath(...paths); + } else { + normalizedPath = path.join(...paths); + if (!path.isAbsolute(normalizedPath) && this.localRoot !== "") { + normalizedPath = path.join(this.localRoot, normalizedPath); + } + } + + return normalizedPath; + } +} diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index e85c4a5e2d..533bb79e21 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -411,7 +411,8 @@ class RemoteMachineTrainingService implements TrainingService { const rmMetaList: RemoteMachineMeta[] = JSON.parse(machineList); let connectedRMNum: number = 0; - rmMetaList.forEach(async (rmMeta: RemoteMachineMeta) => { + const connectionPromises = []; + for (const rmMeta of rmMetaList) { rmMeta.occupiedGpuIndexMap = new Map(); const executorManager: ExecutorManager = new ExecutorManager(rmMeta); this.log.info(`connecting to ${rmMeta.username}@${rmMeta.ip}:${rmMeta.port}`); @@ -419,12 +420,13 @@ class RemoteMachineTrainingService implements TrainingService { this.log.debug(`reached ${executor.name}`); this.machineExecutorManagerMap.set(rmMeta, executorManager); this.log.debug(`initializing 
${executor.name}`); - await this.initRemoteMachineOnConnected(rmMeta, executor); + connectionPromises.push(this.initRemoteMachineOnConnected(rmMeta, executor)); this.log.info(`connected to ${executor.name}`); if (++connectedRMNum === rmMetaList.length) { deferred.resolve(); } - }); + } + Promise.all(connectionPromises); return deferred.promise; } @@ -460,7 +462,7 @@ class RemoteMachineTrainingService implements TrainingService { this.timer.unsubscribe(disposable); } } - if (this.stopping){ + if (this.stopping) { this.timer.unsubscribe(disposable); this.log.debug(`Stopped GPU collector on ${rmMeta.ip}, since experiment is exiting.`); } diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index 392235d0cf..8e98710965 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -302,11 +302,13 @@ def setPathCheck(key): 'paiConfig': Or({ 'userName': setType('userName', str), 'passWord': setType('passWord', str), - 'host': setType('host', str) + 'host': setType('host', str), + Optional('reuse'): setType('reuse', bool) }, { 'userName': setType('userName', str), 'token': setType('token', str), - 'host': setType('host', str) + 'host': setType('host', str), + Optional('reuse'): setType('reuse', bool) }) } diff --git a/tools/nni_cmd/rest_utils.py b/tools/nni_cmd/rest_utils.py index 6fcc978b12..e98c9a8392 100644 --- a/tools/nni_cmd/rest_utils.py +++ b/tools/nni_cmd/rest_utils.py @@ -51,7 +51,7 @@ def rest_delete(url, timeout, show_error=False): def check_rest_server(rest_port): '''Check if restful server is ready''' - retry_count = 5 + retry_count = 20 for _ in range(retry_count): response = rest_get(check_status_url(rest_port), REST_TIME_OUT) if response: @@ -60,7 +60,7 @@ def check_rest_server(rest_port): else: return False, response else: - time.sleep(3) + time.sleep(1) return False, response def check_rest_server_quick(rest_port): diff --git a/tools/nni_trial_tool/log_utils.py b/tools/nni_trial_tool/log_utils.py index 
20c8e74c09..13e5141985 100644 --- a/tools/nni_trial_tool/log_utils.py +++ b/tools/nni_trial_tool/log_utils.py @@ -19,6 +19,7 @@ from .rest_utils import rest_post from .url_utils import gen_send_stdout_url + @unique class LogType(Enum): Trace = 'TRACE' @@ -28,23 +29,27 @@ class LogType(Enum): Error = 'ERROR' Fatal = 'FATAL' + @unique class StdOutputType(Enum): Stdout = 'stdout', Stderr = 'stderr' + def nni_log(log_type, log_message): '''Log message into stdout''' dt = datetime.now() print('[{0}] {1} {2}'.format(dt, log_type.value, log_message), flush=True) + class NNIRestLogHanlder(StreamHandler): - def __init__(self, host, port, tag, std_output_type=StdOutputType.Stdout): + def __init__(self, host, port, tag, trial_id, std_output_type=StdOutputType.Stdout): StreamHandler.__init__(self) self.host = host self.port = port self.tag = tag self.std_output_type = std_output_type + self.trial_id = trial_id self.orig_stdout = sys.__stdout__ self.orig_stderr = sys.__stderr__ @@ -55,24 +60,27 @@ def emit(self, record): log_entry['msg'] = self.format(record) try: - rest_post(gen_send_stdout_url(self.host, self.port), json.dumps(log_entry), 10, True) + rest_post(gen_send_stdout_url(self.host, self.port, self.trial_id), json.dumps(log_entry), 10, True) except Exception as e: self.orig_stderr.write(str(e) + '\n') self.orig_stderr.flush() + class RemoteLogger(object): """ NNI remote logger """ - def __init__(self, syslog_host, syslog_port, tag, std_output_type, log_collection, log_level=logging.INFO): + + def __init__(self, syslog_host, syslog_port, tag, std_output_type, log_collection, trial_id=None, log_level=logging.INFO): ''' constructor ''' self.logger = logging.getLogger('nni_syslog_{}'.format(tag)) self.log_level = log_level self.logger.setLevel(self.log_level) - handler = NNIRestLogHanlder(syslog_host, syslog_port, tag) - self.logger.addHandler(handler) + self.pipeReader = None + self.handler = NNIRestLogHanlder(syslog_host, syslog_port, tag, trial_id) + 
self.logger.addHandler(self.handler) if std_output_type == StdOutputType.Stdout: self.orig_stdout = sys.__stdout__ else: @@ -83,7 +91,8 @@ def get_pipelog_reader(self): ''' Get pipe for remote logger ''' - return PipeLogReader(self.logger, self.log_collection, logging.INFO) + self.pipeReader = PipeLogReader(self.logger, self.log_collection, logging.INFO) + return self.pipeReader def flush(self): ''' @@ -104,10 +113,22 @@ def write(self, buf): except Exception: pass + def close(self): + ''' + Close handlers and resources + ''' + if self.pipeReader is not None: + self.pipeReader.set_process_exit() + for handler in self.logger.handlers: + handler.close() + self.logger.removeHandler(handler) + + class PipeLogReader(threading.Thread): """ The reader thread reads log data from pipe """ + def __init__(self, logger, log_collection, log_level=logging.INFO): """Setup the object with a logger and a loglevel and start the thread @@ -129,7 +150,7 @@ def _populateQueue(stream, queue): ''' Collect lines from 'stream' and put them in 'quque'. ''' - time.sleep(5) + time.sleep(1) while True: cur_process_exit = self.process_exit try: diff --git a/tools/nni_trial_tool/protocol.py b/tools/nni_trial_tool/protocol.py new file mode 100644 index 0000000000..2f1b20e6de --- /dev/null +++ b/tools/nni_trial_tool/protocol.py @@ -0,0 +1,92 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import json +import os +import threading +import time +from datetime import datetime +from enum import Enum + +from .log_utils import LogType, nni_log + +command_path = "./commands" +runner_command_prefix = "runner_command_" +manager_command_prefix = "manager_command_" + + +class CommandType(Enum): + Initialize = b'IN' + RequestTrialJobs = b'GE' + ReportMetricData = b'ME' + UpdateSearchSpace = b'SS' + ImportData = b'FD' + AddCustomizedTrialJob = b'AD' + TrialEnd = b'EN' + Terminate = b'TE' + Ping = b'PI' + + Initialized = b'ID' + NewTrialJob = b'TR' + SendTrialJobParameter = b'SP' + NoMoreTrialJobs = b'NO' + KillTrialJob = b'KI' + + +def send(command, data): + """Send command to Training Service. + command: CommandType object. + data: string payload. + """ + + if not os.path.exists(command_path): + os.makedirs(command_path) + while True: + file_name = os.join(command_path, "%s%s.txt" % ( + runner_command_prefix, int(datetime.now().timestamp * 1000))) + if (os.path.exists(file_name)): + time.sleep(0.01) + continue + with open(file_name, "wb") as out_file: + data = json.dumps(data.encode('utf8')) + msg = b'%b%014d%b' % (command.value, len(data), data) + nni_log(LogType.Info, 'Sending command, data: [%s]' % msg) + out_file.write(msg) + break + + +def receive(): + """Receive a command from Training Service. 
+ Returns a tuple of command (CommandType) and payload (str) + """ + command = None + data = None + + try: + pending_commands = [] + if os.path.exists(command_path): + command_files = os.listdir(command_path) + for item in command_files: + if (item.startswith(manager_command_prefix)): + pending_commands.append(item) + pending_commands.sort() + + if len(pending_commands) > 0: + for command_file in pending_commands: + command_file = os.path.join(command_path, command_file) + with open(command_file, "rb") as _in_file: + header = _in_file.read(16) + nni_log(LogType.Info, 'Received command, header: [%s]' % header) + if header is None or len(header) < 16: + # invalid header + nni_log(LogType.Error, 'incorrect command is found!') + return None, None + length = int(header[2:]) + data = _in_file.read(length) + command = CommandType(header[:2]) + data = json.loads(data.decode('utf8')) + nni_log(LogType.Info, 'Received command, data: [%s]' % data) + os.remove(command_file) + except Exception as identifier: + nni_log(LogType.Error, 'meet unhandled exception: %s' % identifier) + return command, data diff --git a/tools/nni_trial_tool/trial_keeper.py b/tools/nni_trial_tool/trial_keeper.py index 10ee7af211..e5714b006b 100644 --- a/tools/nni_trial_tool/trial_keeper.py +++ b/tools/nni_trial_tool/trial_keeper.py @@ -117,7 +117,7 @@ def main_loop(args): nni_log(LogType.Error, 'HDFS copy directory got exception: ' + str(e)) raise e - ## Exit as the retCode of subprocess(trial) + # Exit as the retCode of subprocess(trial) exit(retCode) break @@ -151,13 +151,13 @@ def check_version(args): nni_manager_version, trial_keeper_version) log_entry['tag'] = 'VCFail' log_entry['msg'] = error_message - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, None), json.dumps(log_entry), 10, False) os._exit(1) else: nni_log(LogType.Info, 'Version match!') log_entry['tag'] = 
'VCSuccess' - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, None), json.dumps(log_entry), 10, False) except AttributeError as err: nni_log(LogType.Error, err) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py new file mode 100644 index 0000000000..17d251a7a8 --- /dev/null +++ b/tools/nni_trial_tool/trial_runner.py @@ -0,0 +1,286 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import argparse +import ctypes +import json +import logging +import os +import re +import shlex +import sys +import threading +import time +import traceback +import tarfile +import psutil +from datetime import datetime +from subprocess import Popen + +import pkg_resources + +idle_timeout_seconds = 10 * 60 + +logger = logging.getLogger('trial_runner') +regular = re.compile('v?(?P[0-9](\.[0-9]){0,1}).*') +trial_output_path_name = ".nni" +trial_runner_syslogger = None + + +class Trial: + def __init__(self, args, data): + self.process = None + self.data = data + self.args = args + self.trial_syslogger_stdout = None + + global NNI_TRIAL_JOB_ID + self.id = data["trialId"] + if self.id is None: + raise Exception("trial_id is not found in %s" % data) + os.environ['NNI_TRIAL_JOB_ID'] = self.id + NNI_TRIAL_JOB_ID = self.id + + def run(self): + # redirect trial runner's stdout and stderr to syslog + self.trial_syslogger_stdout = RemoteLogger(self.args.nnimanager_ip, self.args.nnimanager_port, 'trial', StdOutputType.Stdout, + self.args.log_collection, self.id) + + nni_log(LogType.Info, "start to run trial %s" % self.id) + + trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) + + os.environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) + os.environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") + os.environ['NNI_SYS_DIR'] = trial_working_dir + + 
self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) + os.makedirs(self.trial_output_dir, exist_ok=True) + trial_code_dir = os.path.join(trial_working_dir, "code") + os.makedirs(trial_code_dir, exist_ok=True) + + # prepare code + with tarfile.open(os.path.join("..", "nni-code.tar.gz"), "r:gz") as tar: + tar.extractall(trial_code_dir) + + # save parameters + nni_log(LogType.Info, 'saving parameter %s' % self.data["parameter"]["value"]) + parameter_file_name = os.path.join(trial_working_dir, "parameter.cfg") + with open(parameter_file_name, "w") as parameter_file: + parameter_file.write(self.data["parameter"]["value"]) + + # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior + self.log_pipe_stdout = self.trial_syslogger_stdout.get_pipelog_reader() + self.process = Popen(self.args.trial_command, shell=True, stdout=self.log_pipe_stdout, + stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=os.environ) + nni_log(LogType.Info, 'Trial runner spawns a subprocess (pid {0}) to run command: {1}'. + format(self.process.pid, shlex.split(self.args.trial_command))) + + def is_running(self): + if (self.process is None): + return False + + retCode = self.process.poll() + # child worker process exits and all stdout data is read + if retCode is not None and self.log_pipe_stdout.set_process_exit() and self.log_pipe_stdout.is_read_completed == True: + # In Windows, the retCode -1 is 4294967295. It's larger than c_long, and raise OverflowError. + # So covert it to int32. + retCode = ctypes.c_long(retCode).value + nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. 
Quit'.format(retCode)) + + # Exit as the retCode of subprocess(trial) + exit_code_file_name = os.path.join(self.trial_output_dir, "code") + with open(exit_code_file_name, "w") as exit_file: + exit_file.write("%s %s" % (retCode, int(datetime.now().timestamp() * 1000))) + self.cleanup() + return False + else: + return True + + def kill(self, trial_id=None): + if trial_id == self.id or trial_id is None: + if self.process is not None: + nni_log(LogType.Info, "killing trial %s" % self.id) + for child in psutil.Process(self.process.pid).children(True): + child.kill() + self.process.kill() + self.cleanup() + + def cleanup(self): + nni_log(LogType.Info, "clean up trial %s" % self.id) + self.process = None + if self.log_pipe_stdout is not None: + self.log_pipe_stdout.set_process_exit() + self.log_pipe_stdout = None + if self.trial_syslogger_stdout is not None: + self.trial_syslogger_stdout.close() + self.trial_syslogger_stdout = None + + +def main_loop(args): + '''main loop logic for trial runner''' + idle_last_time = datetime.now() + trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'trial_runner', + StdOutputType.Stdout, args.log_collection, args.runner_id) + sys.stdout = sys.stderr = trial_runner_syslogger + trial = None + + try: + # command loop + while True: + command_type, command_data = receive() + if command_type == CommandType.NewTrialJob: + if trial is not None: + raise Exception('trial %s is running already, cannot start a new one' % trial.trial_id) + trial = Trial(args, command_data) + trial.run() + elif command_type == CommandType.KillTrialJob: + if trial is not None: + trial.kill(command_data) + elif command_type is not None: + raise Exception("unknown command %s" % command_type) + + if trial is not None and trial.is_running(): + idle_last_time = datetime.now() + else: + trial = None + + if (datetime.now() - idle_last_time).seconds > idle_timeout_seconds: + nni_log(LogType.Info, "trial runner is idle more than {0} seconds, so 
exit.".format( + idle_timeout_seconds)) + break + time.sleep(1) + except Exception as ex: + nni_log(LogType.Error, ex) + finally: + if trial is not None: + trial.kill() + + trial_runner_syslogger.close() + trial_runner_syslogger = None + + +def trial_runner_help_info(*args): + print('please run --help to see guidance') + + +def check_version(args): + try: + trial_runner_version = pkg_resources.get_distribution('nni').version + except pkg_resources.ResolutionError as err: + # package nni does not exist, try nni-tool package + nni_log(LogType.Error, 'Package nni does not exist!') + os._exit(1) + if not args.nni_manager_version: + # skip version check + nni_log(LogType.Warning, 'Skipping version check!') + else: + try: + trial_runner_version = regular.search(trial_runner_version).group('version') + nni_log(LogType.Info, 'trial_runner_version is {0}'.format(trial_runner_version)) + nni_manager_version = regular.search(args.nni_manager_version).group('version') + nni_log(LogType.Info, 'nni_manager_version is {0}'.format(nni_manager_version)) + log_entry = {} + if trial_runner_version != nni_manager_version: + nni_log(LogType.Error, 'Version does not match!') + error_message = 'NNIManager version is {0}, Trial runner version is {1}, NNI version does not match!'.format( + nni_manager_version, trial_runner_version) + log_entry['tag'] = 'VCFail' + log_entry['msg'] = error_message + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_id), json.dumps(log_entry), 10, + False) + os._exit(1) + else: + nni_log(LogType.Info, 'Version match!') + log_entry['tag'] = 'VCSuccess' + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_id), json.dumps(log_entry), 10, + False) + except AttributeError as err: + nni_log(LogType.Error, err) + + +def fetch_parameter_file(args): + class FetchThread(threading.Thread): + def __init__(self, args): + super(FetchThread, self).__init__() + self.args = args + + def run(self): + uri = 
gen_parameter_meta_url(self.args.nnimanager_ip, self.args.nnimanager_port) + nni_log(LogType.Info, uri) + + while True: + res = rest_get(uri, 10) + nni_log(LogType.Debug, 'status code: {}'.format(res.status_code)) + if res.status_code != 200: + nni_log(LogType.Warning, 'rest response: {}'.format(str(res))) + time.sleep(2) + + fetch_file_thread = FetchThread(args) + fetch_file_thread.start() + + +if __name__ == '__main__': + + '''NNI Trial Runner main function''' + PARSER = argparse.ArgumentParser() + PARSER.set_defaults(func=trial_runner_help_info) + PARSER.add_argument('--trial_command', type=str, help='Command to launch trial process') + PARSER.add_argument('--nnimanager_ip', type=str, help='NNI manager rest server IP') + PARSER.add_argument('--nnimanager_port', type=str, help='NNI manager rest server port') + PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager') + PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trial runner') + args, unknown = PARSER.parse_known_args() + + setting_file = "../settings.json" + if os.path.exists(setting_file): + with open(setting_file, 'r') as fp: + settings = json.load(fp) + print("setting is {}".format(settings)) + + args.exp_id = settings["experimentId"] + args.platform = settings["platform"] + args.runner_id = "runner_"+os.path.basename(os.path.realpath(os.path.curdir)) + + if args.trial_command is None: + args.trial_command = settings["command"] + if args.nnimanager_ip is None: + args.nnimanager_ip = settings["nniManagerIP"] + if args.nnimanager_port is None: + args.nnimanager_port = settings["nniManagerPort"] + if args.nni_manager_version is None: + args.nni_manager_version = settings["nniManagerVersion"] + if args.log_collection is None: + args.log_collection = settings["logCollection"] + + os.environ['NNI_OUTPUT_DIR'] = os.curdir + "/nnioutput" + os.environ['NNI_PLATFORM'] = args.platform + os.environ['NNI_SYS_DIR'] = os.curdir + 
os.environ['NNI_EXP_ID'] = args.exp_id + os.environ['MULTI_PHASE'] = "true" + os.environ['NNI_TRIAL_JOB_ID'] = "runner" + + from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log + from .rest_utils import rest_get, rest_post + from .url_utils import gen_parameter_meta_url, gen_send_version_url + from .protocol import CommandType, receive + + nni_log(LogType.Info, "merged args is {}".format(args)) + + if args.trial_command is None: + nni_log(LogType.Error, "no command is found.") + os._exit(1) + check_version(args) + try: + main_loop(args) + except SystemExit as se: + nni_log(LogType.Info, 'NNI trial runner exit with code {}'.format(se.code)) + os._exit(se.code) + finally: + if trial_runner_syslogger is not None: + if trial_runner_syslogger.pipeReader is not None: + trial_runner_syslogger.pipeReader.set_process_exit() + trial_runner_syslogger.close() + + # the process doesn't exit even main loop exit. So exit it explictly. + os._exit(0) diff --git a/tools/nni_trial_tool/url_utils.py b/tools/nni_trial_tool/url_utils.py index 1fa926cb8c..8e11ecb9fc 100644 --- a/tools/nni_trial_tool/url_utils.py +++ b/tools/nni_trial_tool/url_utils.py @@ -3,13 +3,20 @@ from .constants import API_ROOT_URL, BASE_URL, STDOUT_API, NNI_TRIAL_JOB_ID, NNI_EXP_ID, VERSION_API, PARAMETER_META_API -def gen_send_stdout_url(ip, port): + +def gen_send_stdout_url(ip, port, trial_id): '''Generate send stdout url''' - return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, STDOUT_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID) + if trial_id is None: + trial_id = NNI_TRIAL_JOB_ID + return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, STDOUT_API, NNI_EXP_ID, trial_id) + -def gen_send_version_url(ip, port): +def gen_send_version_url(ip, port, trial_id): '''Generate send error url''' - return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, VERSION_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID) + if trial_id is None: + trial_id = 
NNI_TRIAL_JOB_ID + return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, VERSION_API, NNI_EXP_ID, trial_id) + def gen_parameter_meta_url(ip, port): '''Generate send error url''' From 2aafac1bf2dfd36a81ec4bf178757d5c6a1a734c Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 3 Jun 2020 10:59:32 +0800 Subject: [PATCH 05/98] refactors 1. rename storage file name 2. add more log on status changes 3. change isEnd to isAlive for better naming --- .../pai/reusable/environment.ts | 4 ++-- .../pai/reusable/environmentManager.ts | 21 +++++++++++-------- .../pai/reusable/mountedStorageService.ts | 2 +- .../pai/reusable/openPaiEnvironmentService.ts | 9 +++++--- .../pai/reusable/reusableTrainingService.ts | 2 +- .../{storage.ts => storageService.ts} | 0 6 files changed, 22 insertions(+), 16 deletions(-) rename src/nni_manager/training_service/pai/reusable/{storage.ts => storageService.ts} (100%) diff --git a/src/nni_manager/training_service/pai/reusable/environment.ts b/src/nni_manager/training_service/pai/reusable/environment.ts index 7b2c485cef..0197870fff 100644 --- a/src/nni_manager/training_service/pai/reusable/environment.ts +++ b/src/nni_manager/training_service/pai/reusable/environment.ts @@ -20,7 +20,7 @@ 'use strict'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../../common/trainingService"; -import { StorageService } from "./storage"; +import { StorageService } from "./storageService"; import * as component from '../../../common/component'; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; @@ -81,7 +81,7 @@ export class EnvironmentInformation { // training platform job friendly name, in case it's different with job ID. 
public jobName: string; public isIdle: boolean = false; - public isEnd: boolean = false; + public isAlive: boolean = true; public trackingUrl: string = ""; public status: EnvironmentStatus = "UNKNOWN"; public workingFolder: string = ""; diff --git a/src/nni_manager/training_service/pai/reusable/environmentManager.ts b/src/nni_manager/training_service/pai/reusable/environmentManager.ts index 5c284ea4c6..53018922cb 100644 --- a/src/nni_manager/training_service/pai/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/pai/reusable/environmentManager.ts @@ -34,7 +34,7 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { validateCodeDir } from '../../common/util'; import { EnvironmentInformation, EnvironmentService, RunnerSettings, TrialDetail } from './environment'; import { JobRestServer } from './jobRestServer'; -import { StorageService } from './storage'; +import { StorageService } from './storageService'; /** * It uses to manage jobs on training platforms @@ -218,7 +218,7 @@ class EnvironmentManager implements TrainingService { const environments = [...this.environments.values()]; for (let index = 0; index < environments.length; index++) { const environment = environments[index]; - if (environment.isEnd === false) { + if (environment.isAlive === true) { this.log.info(`stopping environment ${environment.id}...`); await environmentService.stopEnvironment(environment); this.log.info(`stopped environment ${environment.id}.`); @@ -265,24 +265,27 @@ class EnvironmentManager implements TrainingService { while (!this.stopping) { const environments: EnvironmentInformation[] = []; this.environments.forEach((environment) => { - if (environment.isEnd === false) { + if (environment.isAlive === true) { environments.push(environment); } }); environmentService.updateEnvironmentsStatus(environments); environments.forEach((environment) => { + const oldIsAlive = environment.isAlive; switch (environment.status) { case 'WAITING': 
case 'RUNNING': case 'UNKNOWN': - environment.isEnd = false; + environment.isAlive = true; break; default: - this.log.debug(`set environment ${environment.jobId} ${environment.status} to ended`); - environment.isEnd = true; + environment.isAlive = false; break; } + if (oldIsAlive !== environment.isAlive) { + this.log.debug(`set environment isAlive from ${oldIsAlive} to ${environment.status} due to status is ${environment.status}.`); + } }); await delay(5000); } @@ -350,7 +353,7 @@ class EnvironmentManager implements TrainingService { let liveEnvironmentsCount = 0; const idleEnvironments: EnvironmentInformation[] = []; this.environments.forEach((environment) => { - if (!environment.isEnd) { + if (environment.isAlive === true) { liveEnvironmentsCount++; if (environment.status === "RUNNING" && environment.isIdle) { idleEnvironments.push(environment); @@ -401,11 +404,11 @@ class EnvironmentManager implements TrainingService { if (environment.status === "FAILED") { environment.isIdle = false; - environment.isEnd = true; + environment.isAlive = false; throw new Error(`error on request environment ${environment.jobId}, please check log for more details.`); } else { environment.isIdle = true; - environment.isEnd = false; + environment.isAlive = true; } this.log.info(`requested environment ${environment.id} and job id is ${environment.jobId}.`); } diff --git a/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts b/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts index 5dd0c380fa..5cd143bed2 100644 --- a/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts +++ b/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts @@ -20,7 +20,7 @@ import * as fs from 'fs'; import * as path from 'path'; import { Deferred } from "ts-deferred"; -import { StorageService } from "./storage"; +import { StorageService } from "./storageService"; export class MountedStorageService extends StorageService { diff --git 
a/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts index 5422cdef35..b088f33b83 100644 --- a/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts @@ -29,7 +29,7 @@ import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../paiConfig'; import { NNIPAIK8STrialConfig } from '../paiK8S/paiK8SConfig'; import { EnvironmentInformation, EnvironmentService } from './environment'; -import { StorageService } from './storage'; +import { StorageService } from './storageService'; const yaml = require('js-yaml'); @@ -90,6 +90,7 @@ export class OpenPaiEnvironmentService implements EnvironmentService { if (jobInfos.has(environment.jobId)) { const jobResponse = jobInfos.get(environment.jobId); if (jobResponse && jobResponse.state) { + const oldEnvironmentStatus = environment.status; switch (jobResponse.state) { case 'WAITING': case 'RUNNING': @@ -105,6 +106,9 @@ export class OpenPaiEnvironmentService implements EnvironmentService { this.log.error(`OpenPAI: job ${environment.jobId} returns unknown state ${jobResponse.state}.`); environment.status = 'UNKNOWN'; } + if (oldEnvironmentStatus !== environment.status) { + this.log.debug(`OpenPAI: job ${environment.jobId} change status ${oldEnvironmentStatus} to ${environment.status} due to job is ${jobResponse.state}.`) + } } else { this.log.error(`OpenPAI: job ${environment.jobId} has no state returned. body:${JSON.stringify(jobResponse)}`); // some error happens, and mark this environment @@ -200,7 +204,7 @@ export class OpenPaiEnvironmentService implements EnvironmentService { deferred.reject((error !== undefined && error !== null) ? 
error : `Stop trial failed, http code: ${response.statusCode}`); } else { - this.log.info(`OpenPAI job ${environment.jobId} stopped, body: ${response.body}.`); + this.log.info(`OpenPAI job ${environment.jobId} stopped.`); } deferred.resolve(); } catch (error) { @@ -249,7 +253,6 @@ export class OpenPaiEnvironmentService implements EnvironmentService { case TrialConfigMetadataKey.MULTI_PHASE: break; default: - //Reject for unknown keys this.log.debug(`OpenPAI not proccessed metadata key: '${key}', value: '${value}'`); } } diff --git a/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts b/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts index 8668fa4b1d..de3fc979d7 100644 --- a/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts +++ b/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts @@ -30,7 +30,7 @@ import { EnvironmentManager } from './environmentManager'; import { Container, Scope } from 'typescript-ioc'; import { EnvironmentService } from './environment'; import { OpenPaiEnvironmentService } from './openPaiEnvironmentService'; -import { StorageService } from './storage'; +import { StorageService } from './storageService'; import { MountedStorageService } from './mountedStorageService'; diff --git a/src/nni_manager/training_service/pai/reusable/storage.ts b/src/nni_manager/training_service/pai/reusable/storageService.ts similarity index 100% rename from src/nni_manager/training_service/pai/reusable/storage.ts rename to src/nni_manager/training_service/pai/reusable/storageService.ts From 0435b7f17edecc3305cd1c739be148ee8d7dc36a Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 3 Jun 2020 14:11:38 +0800 Subject: [PATCH 06/98] refactoring add internal prefix for internal storage methods for clear usage. 
fix pylint errors minor fixes --- src/nni_manager/main.ts | 2 +- .../pai/reusable/environmentManager.ts | 2 +- .../pai/reusable/mountedStorageService.ts | 49 +++++++------ .../pai/reusable/storageService.ts | 72 +++++++++---------- tools/nni_trial_tool/protocol.py | 4 +- tools/nni_trial_tool/trial_runner.py | 11 ++- 6 files changed, 69 insertions(+), 71 deletions(-) diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 3b0d1bb07c..da6a68441f 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -205,7 +205,7 @@ process.on(getStopSignal(), async () => { hasError = true; log.error(`${err.stack}`); } finally { - // await log.close(); + await log.close(); process.exit(hasError ? 1 : 0); } }); diff --git a/src/nni_manager/training_service/pai/reusable/environmentManager.ts b/src/nni_manager/training_service/pai/reusable/environmentManager.ts index 53018922cb..d40183fdc9 100644 --- a/src/nni_manager/training_service/pai/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/pai/reusable/environmentManager.ts @@ -284,7 +284,7 @@ class EnvironmentManager implements TrainingService { break; } if (oldIsAlive !== environment.isAlive) { - this.log.debug(`set environment isAlive from ${oldIsAlive} to ${environment.status} due to status is ${environment.status}.`); + this.log.debug(`set environment isAlive from ${oldIsAlive} to ${environment.isAlive} due to status is ${environment.status}.`); } }); await delay(5000); diff --git a/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts b/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts index 5cd143bed2..19e600ff70 100644 --- a/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts +++ b/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts @@ -24,17 +24,17 @@ import { StorageService } from "./storageService"; export class MountedStorageService extends StorageService { - protected config(_key: string, _value: 
string): void { + protected internalConfig(_key: string, _value: string): void { // nothing to config } - protected async remove(path: string, isDirectory: boolean, isRecursive: boolean): Promise { + protected async internalRemove(path: string, isDirectory: boolean, isRecursive: boolean): Promise { if (isDirectory) { if (isRecursive) { const children = await fs.promises.readdir(path); for (const file of children) { const stat = await fs.promises.lstat(file); - this.remove(file, stat.isDirectory(), isRecursive); + this.internalRemove(file, stat.isDirectory(), isRecursive); } } else { await fs.promises.rmdir(path); @@ -44,52 +44,51 @@ export class MountedStorageService extends StorageService { } } - protected async rename(remotePath: string, newName: string): Promise { + protected async internalRename(remotePath: string, newName: string): Promise { const dirName = path.dirname(remotePath); - newName = this.joinPath(dirName, newName); + newName = this.internalJoin(dirName, newName); await fs.promises.rename(remotePath, newName); } - protected async mkdir(remotePath: string): Promise { + protected async internalMkdir(remotePath: string): Promise { if (!fs.existsSync(remotePath)) { await fs.promises.mkdir(remotePath, { recursive: true }); } } - protected async copy(localPath: string, remotePath: string, isDirectory: boolean, isToRemote: boolean): Promise { - if (localPath === remotePath) { - return remotePath; + protected async internalCopy(sourcePath: string, targetPath: string, isDirectory: boolean, isFromRemote: boolean = false, isToRemote: boolean = true): Promise { + if (sourcePath === targetPath) { + return targetPath; } - const sourcePath = isToRemote ? localPath : remotePath; - let targetPath = isToRemote ? 
remotePath : localPath; - this.logger.debug(`copying ${sourcePath} to ${targetPath}, dir ${isDirectory}, isRemote: ${isToRemote}`); + this.logger.debug(`copying ${sourcePath} to ${targetPath}, dir ${isDirectory}, isFromRemote ${isFromRemote}, isToRemote: ${isToRemote}`); if (isDirectory) { + const basename = isFromRemote ? this.internalBasename(sourcePath) : path.basename(sourcePath); if (isToRemote) { - targetPath = this.joinPath(targetPath, this.basename(localPath)); + targetPath = this.internalJoin(targetPath, basename); + await this.internalMkdir(targetPath); } else { - targetPath = path.join(targetPath, this.basename(remotePath)) + targetPath = path.join(targetPath, basename); + await fs.promises.mkdir(targetPath); } - await this.mkdir(targetPath); const children = await fs.promises.readdir(sourcePath); for (const child of children) { - const childSourcePath = this.joinPath(sourcePath, child); + const childSourcePath = this.internalJoin(sourcePath, child); const stat = await fs.promises.lstat(childSourcePath); - // true: the source and target is aligned already, so always set isToRemote to true. 
- this.copy(childSourcePath, targetPath, stat.isDirectory(), true); + await this.internalCopy(childSourcePath, targetPath, stat.isDirectory(), isFromRemote, isToRemote); } return targetPath; } else { // This behavior may not be consistent for each platform, but it needs to correct to same - await this.mkdir(targetPath); + await this.internalMkdir(targetPath); const targetFileName = path.join(targetPath, path.basename(sourcePath)); await fs.promises.copyFile(sourcePath, targetFileName); return targetFileName; } } - protected async exists(remotePath: string): Promise { + protected async internalExists(remotePath: string): Promise { const deferred = new Deferred(); fs.exists(remotePath, (exists) => { deferred.resolve(exists); @@ -97,7 +96,7 @@ export class MountedStorageService extends StorageService { return deferred.promise; } - protected async read(remotePath: string, offset?: number, length?: number): Promise { + protected async internalRead(remotePath: string, offset?: number, length?: number): Promise { const deferred = new Deferred(); // set a max length to 1MB for performance concern. 
const maxLength = 1024 * 1024; @@ -129,19 +128,19 @@ export class MountedStorageService extends StorageService { } - protected isRelativePath(remotePath: string): boolean { + protected internalIsRelativePath(remotePath: string): boolean { return !path.isAbsolute(remotePath); } - protected joinPath(...paths: string[]): string { + protected internalJoin(...paths: string[]): string { return path.join(...paths); } - protected dirname(remotePath: string): string { + protected internalDirname(remotePath: string): string { return path.dirname(remotePath); } - protected basename(remotePath: string): string { + protected internalBasename(remotePath: string): string { return path.basename(remotePath); } } diff --git a/src/nni_manager/training_service/pai/reusable/storageService.ts b/src/nni_manager/training_service/pai/reusable/storageService.ts index 35071085ee..0bb9a5b59a 100644 --- a/src/nni_manager/training_service/pai/reusable/storageService.ts +++ b/src/nni_manager/training_service/pai/reusable/storageService.ts @@ -31,17 +31,17 @@ export abstract class StorageService { protected remoteRoot: string = ""; protected logger: Logger; - protected abstract config(key: string, value: string): void; - protected abstract async remove(remotePath: string, isDirectory: boolean, isRecursive: boolean): Promise; - protected abstract async rename(remotePath: string, newName: string): Promise; - protected abstract async mkdir(remotePath: string): Promise; - protected abstract async copy(localPath: string, remotePath: string, isDirectory: boolean, isToRemote: boolean): Promise; - protected abstract async exists(remotePath: string): Promise; - protected abstract async read(remotePath: string, offset: number, length: number): Promise; - protected abstract isRelativePath(path: string): boolean; - protected abstract joinPath(...paths: string[]): string; - protected abstract dirname(...paths: string[]): string; - protected abstract basename(...paths: string[]): string; + protected abstract 
internalConfig(key: string, value: string): void; + protected abstract async internalRemove(remotePath: string, isDirectory: boolean, isRecursive: boolean): Promise; + protected abstract async internalRename(remotePath: string, newName: string): Promise; + protected abstract async internalMkdir(remotePath: string): Promise; + protected abstract async internalCopy(localPath: string, remotePath: string, isDirectory: boolean, isFromRemote: boolean, isToRemote: boolean): Promise; + protected abstract async internalExists(remotePath: string): Promise; + protected abstract async internalRead(remotePath: string, offset: number, length: number): Promise; + protected abstract internalIsRelativePath(path: string): boolean; + protected abstract internalJoin(...paths: string[]): string; + protected abstract internalDirname(...paths: string[]): string; + protected abstract internalBasename(...paths: string[]): string; constructor() { this.logger = getLogger(); @@ -56,13 +56,13 @@ export abstract class StorageService { public async renameRemote(remotePath: string, newName: string): Promise { remotePath = this.expandPath(true, remotePath); this.logger.debug(`rename remotePath: ${remotePath} to: ${newName}`); - await this.rename(remotePath, newName); + await this.internalRename(remotePath, newName); } public async createDirectory(remotePath: string): Promise { remotePath = this.expandPath(true, remotePath); this.logger.debug(`create remotePath: ${remotePath}`); - await this.mkdir(remotePath); + await this.internalMkdir(remotePath); } public async copyDirectory(localPath: string, remotePath: string, asGzip: boolean = false): Promise { @@ -70,7 +70,7 @@ export abstract class StorageService { remotePath = this.expandPath(true, remotePath); this.logger.debug(`copy localPath: ${localPath} to remotePath: ${remotePath}, asGzip ${asGzip}`); if (!await this.existsRemote(remotePath)) { - await this.mkdir(remotePath); + await this.internalMkdir(remotePath); } if (asGzip) { @@ -79,15 +79,15 
@@ export abstract class StorageService { const tarFileName = `${localPathBaseName}.tar.gz`; const localTarPath: string = path.join(os.tmpdir(), tempTarFileName); await tarAdd(localTarPath, localPath); - await this.copy(localTarPath, remotePath, false, true); - const remoteFileName = this.joinPath(remotePath, tempTarFileName); - await this.rename(remoteFileName, tarFileName); + await this.internalCopy(localTarPath, remotePath, false, false, true); + const remoteFileName = this.internalJoin(remotePath, tempTarFileName); + await this.internalRename(remoteFileName, tarFileName); await fs.promises.unlink(localTarPath); - remotePath = this.joinPath(remotePath, tarFileName); + remotePath = this.internalJoin(remotePath, tarFileName); } else { - await this.copy(localPath, remotePath, true, true); - remotePath = this.joinPath(remotePath, path.basename(localPath)); + await this.internalCopy(localPath, remotePath, true, false, true); + remotePath = this.internalJoin(remotePath, path.basename(localPath)); } return remotePath; @@ -97,44 +97,44 @@ export abstract class StorageService { localPath = this.expandPath(false, localPath); remotePath = this.expandPath(true, remotePath); this.logger.debug(`copy remotePath: ${remotePath} to localPath: ${localPath}`); - return await this.copy(localPath, remotePath, true, false); + return await this.internalCopy(localPath, remotePath, true, true, false); } public async removeDirectory(remotePath: string, isRecursive: boolean): Promise { remotePath = this.expandPath(true, remotePath); this.logger.debug(`remove remotePath: ${remotePath}`); - await this.remove(remotePath, true, isRecursive); + await this.internalRemove(remotePath, true, isRecursive); } public async readRemoteFile(remotePath: string, offset: number = -1, length: number = -1): Promise { remotePath = this.expandPath(true, remotePath); this.logger.debug(`read remote file: ${remotePath}, offset: ${offset}, length: ${length}`); - return this.read(remotePath, offset, length); + 
return this.internalRead(remotePath, offset, length); } public async existsRemote(remotePath: string): Promise { remotePath = this.expandPath(true, remotePath); - const exists = await this.exists(remotePath); + const exists = await this.internalExists(remotePath); this.logger.debug(`check exists remotePath: ${remotePath} is ${exists}`); return exists } public async save(content: string, remotePath: string): Promise { this.logger.debug(`save content to remotePath: ${remotePath}, length: ${content.length}`); - const fileName = this.basename(remotePath); + const fileName = this.internalBasename(remotePath); const tempFileName = `temp_${uniqueString(4)}_${fileName}`; remotePath = this.expandPath(true, remotePath); const localTempFileName = path.join(os.tmpdir(), tempFileName); - const remoteDir = this.dirname(remotePath); - const remoteTempFile = this.joinPath(remoteDir, tempFileName); + const remoteDir = this.internalDirname(remotePath); + const remoteTempFile = this.internalJoin(remoteDir, tempFileName); - if (await this.exists(remotePath) === true) { - await this.remove(remotePath, false, false); + if (await this.internalExists(remotePath) === true) { + await this.internalRemove(remotePath, false, false); } await fs.promises.writeFile(localTempFileName, content); - await this.copy(localTempFileName, remoteDir, false, true); + await this.internalCopy(localTempFileName, remoteDir, false, false, true); await this.renameRemote(remoteTempFile, fileName); await fs.promises.unlink(localTempFileName); } @@ -143,26 +143,26 @@ export abstract class StorageService { localPath = this.expandPath(false, localPath); remotePath = this.expandPath(true, remotePath); this.logger.debug(`copy file localPath: ${localPath} to remotePath: ${remotePath}`); - await this.copy(localPath, remotePath, false, true); + await this.internalCopy(localPath, remotePath, false, false, true); } public async copyFileBack(remotePath: string, localPath: string): Promise { localPath = this.expandPath(false, 
localPath); remotePath = this.expandPath(true, remotePath); this.logger.debug(`copy file remotePath: ${remotePath} to localPath: ${localPath}`); - await this.copy(localPath, remotePath, false, false); + await this.internalCopy(localPath, remotePath, false, true, false); } public async removeFile(remotePath: string): Promise { remotePath = this.expandPath(true, remotePath); this.logger.debug(`remove file remotePath: ${remotePath}`); - await this.remove(remotePath, false, false); + await this.internalRemove(remotePath, false, false); } public joinRemotePath(...paths: string[]): string { - let fullPath = this.joinPath(...paths); - if (this.isRelativePath(fullPath) === true && this.remoteRoot !== "") { - fullPath = this.joinPath(this.remoteRoot, fullPath); + let fullPath = this.internalJoin(...paths); + if (this.internalIsRelativePath(fullPath) === true && this.remoteRoot !== "") { + fullPath = this.internalJoin(this.remoteRoot, fullPath); } return fullPath; } diff --git a/tools/nni_trial_tool/protocol.py b/tools/nni_trial_tool/protocol.py index 2f1b20e6de..f6863d198c 100644 --- a/tools/nni_trial_tool/protocol.py +++ b/tools/nni_trial_tool/protocol.py @@ -3,7 +3,6 @@ import json import os -import threading import time from datetime import datetime from enum import Enum @@ -48,7 +47,8 @@ def send(command, data): time.sleep(0.01) continue with open(file_name, "wb") as out_file: - data = json.dumps(data.encode('utf8')) + data = json.dumps(data) + data = data.encode('utf8') msg = b'%b%014d%b' % (command.value, len(data), data) nni_log(LogType.Info, 'Sending command, data: [%s]' % msg) out_file.write(msg) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 17d251a7a8..c77f63c16b 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -9,19 +9,18 @@ import re import shlex import sys +import tarfile import threading import time -import traceback -import tarfile -import psutil from datetime 
import datetime from subprocess import Popen import pkg_resources +import psutil idle_timeout_seconds = 10 * 60 -logger = logging.getLogger('trial_runner') +logger = logging.getLogger('runner') regular = re.compile('v?(?P[0-9](\.[0-9]){0,1}).*') trial_output_path_name = ".nni" trial_runner_syslogger = None @@ -120,7 +119,7 @@ def cleanup(self): def main_loop(args): '''main loop logic for trial runner''' idle_last_time = datetime.now() - trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'trial_runner', + trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'runner', StdOutputType.Stdout, args.log_collection, args.runner_id) sys.stdout = sys.stderr = trial_runner_syslogger trial = None @@ -177,7 +176,7 @@ def check_version(args): else: try: trial_runner_version = regular.search(trial_runner_version).group('version') - nni_log(LogType.Info, 'trial_runner_version is {0}'.format(trial_runner_version)) + nni_log(LogType.Info, 'runner_version is {0}'.format(trial_runner_version)) nni_manager_version = regular.search(args.nni_manager_version).group('version') nni_log(LogType.Info, 'nni_manager_version is {0}'.format(nni_manager_version)) log_entry = {} From 2e5ef51042fc9fc5f35cd342a51f06b9acef61bf Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 3 Jun 2020 16:30:53 +0800 Subject: [PATCH 07/98] minor fix, and take some review comments. 
--- src/nni_manager/main.ts | 4 ++-- .../pai/reusable/environment.ts | 10 +++++--- .../pai/reusable/environmentManager.ts | 14 +++++------ ...ngService.ts => forwardTrainingService.ts} | 4 ++-- .../pai/reusable/openPaiEnvironmentService.ts | 23 ++++++++++++------- .../pai/reusable/storageService.ts | 6 ++--- 6 files changed, 35 insertions(+), 26 deletions(-) rename src/nni_manager/training_service/pai/reusable/{reusableTrainingService.ts => forwardTrainingService.ts} (98%) diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index da6a68441f..5a4d0a93fa 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -21,7 +21,7 @@ import { NNIRestServer } from './rest_server/nniRestServer'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService'; -import { ReusableTrainingService } from './training_service/pai/reusable/reusableTrainingService'; +import { ForwardTrainingService } from './training_service/pai/reusable/forwardTrainingService'; import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService'; import { RemoteMachineTrainingService @@ -47,7 +47,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN .scope(Scope.Singleton); } else if (platformMode === 'pai') { Container.bind(TrainingService) - .to(ReusableTrainingService) + .to(ForwardTrainingService) .scope(Scope.Singleton); } else if (platformMode === 'paiYarn') { Container.bind(TrainingService) diff --git a/src/nni_manager/training_service/pai/reusable/environment.ts b/src/nni_manager/training_service/pai/reusable/environment.ts index 0197870fff..a02fea544e 100644 --- a/src/nni_manager/training_service/pai/reusable/environment.ts +++ 
b/src/nni_manager/training_service/pai/reusable/environment.ts @@ -80,15 +80,19 @@ export class EnvironmentInformation { public jobId: string; // training platform job friendly name, in case it's different with job ID. public jobName: string; + + // key states + // true: environment is ready to run trial. public isIdle: boolean = false; + // true: environment is running, waiting, or unknown. public isAlive: boolean = true; - public trackingUrl: string = ""; public status: EnvironmentStatus = "UNKNOWN"; + + public trackingUrl: string = ""; public workingFolder: string = ""; - public envWorkingFolder: string = ""; + public runnerWorkingFolder: string = ""; public command: string = ""; public serverCount: number = 1; - public currentTrialId: string = ""; constructor(id: string, jobName: string, jobId?: string) { this.id = id; diff --git a/src/nni_manager/training_service/pai/reusable/environmentManager.ts b/src/nni_manager/training_service/pai/reusable/environmentManager.ts index d40183fdc9..c4b83fe1f0 100644 --- a/src/nni_manager/training_service/pai/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/pai/reusable/environmentManager.ts @@ -125,11 +125,10 @@ class EnvironmentManager implements TrainingService { { const environment = trial.environment; if (environment) { + await this.sendCommand(KILL_TRIAL_JOB, trialJobId, environment); trial.isEarlyStopped = isEarlyStopped; trial.status = trial.isEarlyStopped === true ? 
'EARLY_STOPPED' : 'USER_CANCELED'; - - await this.sendCommand(KILL_TRIAL_JOB, trialJobId, environment); this.releaseEnvironment(trial); } } @@ -362,11 +361,10 @@ class EnvironmentManager implements TrainingService { }); while (idleEnvironments.length > 0 && waitingTrials.length > 0) { - for (const trial of waitingTrials) { - const idleEnvironment = idleEnvironments.pop(); - if (idleEnvironment) { - await this.assignEnvironment(trial, idleEnvironment); - } + const trial = waitingTrials.shift(); + const idleEnvironment = idleEnvironments.shift(); + if (trial !== undefined && idleEnvironment != undefined) { + await this.assignEnvironment(trial, idleEnvironment); } } @@ -415,7 +413,7 @@ class EnvironmentManager implements TrainingService { private async assignEnvironment(trial: TrialDetail, environment: EnvironmentInformation): Promise { if (trial.environment) { - throw new Error(`trial ${trial.id} has assigned environment ${environment.id} already!`); + throw new Error(`trial ${trial.id} has assigned environment ${trial.environment.id} already, not assign to ${environment.id}!`); } if (environment.isIdle == false) { throw new Error(`environment ${environment.id} is not idle, and cannot be assigned again!`); diff --git a/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts b/src/nni_manager/training_service/pai/reusable/forwardTrainingService.ts similarity index 98% rename from src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts rename to src/nni_manager/training_service/pai/reusable/forwardTrainingService.ts index de3fc979d7..ccc58c4aed 100644 --- a/src/nni_manager/training_service/pai/reusable/reusableTrainingService.ts +++ b/src/nni_manager/training_service/pai/reusable/forwardTrainingService.ts @@ -39,7 +39,7 @@ import { MountedStorageService } from './mountedStorageService'; * The final goal is to support reusable training job in higher level than training service. 
*/ @component.Singleton -class ReusableTrainingService implements TrainingService { +class ForwardTrainingService implements TrainingService { protected readonly log!: Logger; private internalTrainingService: TrainingService | undefined; private metaDataCache: Map = new Map(); @@ -170,4 +170,4 @@ class ReusableTrainingService implements TrainingService { } } -export { ReusableTrainingService }; +export { ForwardTrainingService }; diff --git a/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts index b088f33b83..772d118e5c 100644 --- a/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts @@ -141,14 +141,15 @@ export class OpenPaiEnvironmentService implements EnvironmentService { } // Step 1. Prepare PAI job configuration - environment.envWorkingFolder = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/envs/${environment.id}`; - environment.command = `cd ${environment.envWorkingFolder} && ${environment.command}` + environment.runnerWorkingFolder = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/envs/${environment.id}`; + environment.command = `cd ${environment.runnerWorkingFolder} && ${environment.command}` environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.jobId}` - // Generate Job Configuration in yaml format + // Step 2. Generate Job Configuration in yaml format const paiJobConfig = this.generateJobConfigInYamlFormat(environment); this.log.debug(`generated paiJobConfig: ${paiJobConfig}`); - // Step 2. Submit PAI job via Rest call + + // Step 3. 
Submit PAI job via Rest call const submitJobRequest: request.Options = { uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, method: 'POST', @@ -281,14 +282,20 @@ export class OpenPaiEnvironmentService implements EnvironmentService { nniJobConfig = JSON.parse(JSON.stringify(this.paiJobConfig)); //Trick for deep clone in Typescript nniJobConfig.name = jobName; if (nniJobConfig.taskRoles) { - environment.serverCount = nniJobConfig.taskRoles.length; // Each taskRole will generate new command in NNI's command format // Each command will be formatted to NNI style for (const taskRoleIndex in nniJobConfig.taskRoles) { - const commands = nniJobConfig.taskRoles[taskRoleIndex].commands - const nniTrialCommand = `${environment.command} ${commands.join(" && ").replace(/(["'$`\\])/g, '\\$1')}`; - nniJobConfig.taskRoles[taskRoleIndex].commands = [nniTrialCommand] + const taskRole = nniJobConfig.taskRoles[taskRoleIndex]; + let instanceCount = 1; + if (taskRole.instances) { + instanceCount = taskRole.instances; + } + + environment.serverCount += instanceCount; + + const nniTrialCommand = `${environment.command} ${taskRole.commands.join(" && ").replace(/(["'$`\\])/g, '\\$1')}`; + taskRole.commands = [nniTrialCommand]; } } diff --git a/src/nni_manager/training_service/pai/reusable/storageService.ts b/src/nni_manager/training_service/pai/reusable/storageService.ts index 0bb9a5b59a..f4ed872054 100644 --- a/src/nni_manager/training_service/pai/reusable/storageService.ts +++ b/src/nni_manager/training_service/pai/reusable/storageService.ts @@ -35,7 +35,7 @@ export abstract class StorageService { protected abstract async internalRemove(remotePath: string, isDirectory: boolean, isRecursive: boolean): Promise; protected abstract async internalRename(remotePath: string, newName: string): Promise; protected abstract async internalMkdir(remotePath: string): Promise; - protected abstract async internalCopy(localPath: string, remotePath: string, isDirectory: 
boolean, isFromRemote: boolean, isToRemote: boolean): Promise; + protected abstract async internalCopy(sourcePath: string, targetPath: string, isDirectory: boolean, isFromRemote: boolean, isToRemote: boolean): Promise; protected abstract async internalExists(remotePath: string): Promise; protected abstract async internalRead(remotePath: string, offset: number, length: number): Promise; protected abstract internalIsRelativePath(path: string): boolean; @@ -120,11 +120,11 @@ export abstract class StorageService { } public async save(content: string, remotePath: string): Promise { - this.logger.debug(`save content to remotePath: ${remotePath}, length: ${content.length}`); + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`saving content to remotePath: ${remotePath}, length: ${content.length}`); const fileName = this.internalBasename(remotePath); const tempFileName = `temp_${uniqueString(4)}_${fileName}`; - remotePath = this.expandPath(true, remotePath); const localTempFileName = path.join(os.tmpdir(), tempFileName); const remoteDir = this.internalDirname(remotePath); From 6d7bc62dcde4e5e3f147589499bb04eef90f80d8 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 3 Jun 2020 16:38:16 +0800 Subject: [PATCH 08/98] move reuse to upper level --- src/nni_manager/main.ts | 2 +- .../{pai => }/reusable/environment.ts | 4 ++-- .../{pai => }/reusable/environmentManager.ts | 22 +++++++++---------- .../reusable/forwardTrainingService.ts | 14 ++++++------ .../{pai => }/reusable/jobRestServer.ts | 2 +- .../reusable/mountedStorageService.ts | 0 .../reusable/openPaiEnvironmentService.ts | 12 +++++----- .../{pai => }/reusable/storageService.ts | 6 ++--- 8 files changed, 31 insertions(+), 31 deletions(-) rename src/nni_manager/training_service/{pai => }/reusable/environment.ts (97%) rename src/nni_manager/training_service/{pai => }/reusable/environmentManager.ts (96%) rename src/nni_manager/training_service/{pai => 
}/reusable/forwardTrainingService.ts (94%) rename src/nni_manager/training_service/{pai => }/reusable/jobRestServer.ts (97%) rename src/nni_manager/training_service/{pai => }/reusable/mountedStorageService.ts (100%) rename src/nni_manager/training_service/{pai => }/reusable/openPaiEnvironmentService.ts (98%) rename src/nni_manager/training_service/{pai => }/reusable/storageService.ts (98%) diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 5a4d0a93fa..d626685a18 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -21,7 +21,7 @@ import { NNIRestServer } from './rest_server/nniRestServer'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService'; -import { ForwardTrainingService } from './training_service/pai/reusable/forwardTrainingService'; +import { ForwardTrainingService } from './training_service/reusable/forwardTrainingService'; import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService'; import { RemoteMachineTrainingService diff --git a/src/nni_manager/training_service/pai/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts similarity index 97% rename from src/nni_manager/training_service/pai/reusable/environment.ts rename to src/nni_manager/training_service/reusable/environment.ts index a02fea544e..2c4c7b8867 100644 --- a/src/nni_manager/training_service/pai/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -19,9 +19,9 @@ 'use strict'; -import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../../common/trainingService"; +import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../common/trainingService"; import 
{ StorageService } from "./storageService"; -import * as component from '../../../common/component'; +import * as component from '../../common/component'; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; diff --git a/src/nni_manager/training_service/pai/reusable/environmentManager.ts b/src/nni_manager/training_service/reusable/environmentManager.ts similarity index 96% rename from src/nni_manager/training_service/pai/reusable/environmentManager.ts rename to src/nni_manager/training_service/reusable/environmentManager.ts index c4b83fe1f0..5077fe1df3 100644 --- a/src/nni_manager/training_service/pai/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/reusable/environmentManager.ts @@ -21,17 +21,17 @@ import { EventEmitter } from 'events'; import * as path from 'path'; -import * as component from '../../../common/component'; -import { getExperimentId, getPlatform } from '../../../common/experimentStartupInfo'; -import { getLogger, Logger } from '../../../common/log'; -import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric } from '../../../common/trainingService'; -import { delay, generateParamFileName, getVersion, uniqueString } from '../../../common/utils'; -import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../../core/commands'; -import { encodeCommand } from '../../../core/ipcInterface'; -import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../../common/containerJobData'; -import { TrialConfig } from '../../common/trialConfig'; -import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; -import { validateCodeDir } from '../../common/util'; +import * as component from '../../common/component'; +import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../common/log'; +import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric } from 
'../../common/trainingService'; +import { delay, generateParamFileName, getVersion, uniqueString } from '../../common/utils'; +import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../core/commands'; +import { encodeCommand } from '../../core/ipcInterface'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; +import { TrialConfig } from '../common/trialConfig'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { validateCodeDir } from '../common/util'; import { EnvironmentInformation, EnvironmentService, RunnerSettings, TrialDetail } from './environment'; import { JobRestServer } from './jobRestServer'; import { StorageService } from './storageService'; diff --git a/src/nni_manager/training_service/pai/reusable/forwardTrainingService.ts b/src/nni_manager/training_service/reusable/forwardTrainingService.ts similarity index 94% rename from src/nni_manager/training_service/pai/reusable/forwardTrainingService.ts rename to src/nni_manager/training_service/reusable/forwardTrainingService.ts index ccc58c4aed..b6944dadef 100644 --- a/src/nni_manager/training_service/pai/reusable/forwardTrainingService.ts +++ b/src/nni_manager/training_service/reusable/forwardTrainingService.ts @@ -19,13 +19,13 @@ 'use strict'; -import * as component from '../../../common/component'; -import { getLogger, Logger } from '../../../common/log'; -import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../../common/trainingService'; -import { delay } from '../../../common/utils'; -import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; -import { PAIClusterConfig } from '../paiConfig'; -import { PAIK8STrainingService } from '../paiK8S/paiK8STrainingService'; +import * as component from '../../common/component'; +import { getLogger, Logger } from '../../common/log'; +import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from 
'../../common/trainingService'; +import { delay } from '../../common/utils'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { PAIClusterConfig } from '../pai/paiConfig'; +import { PAIK8STrainingService } from '../pai/paiK8S/paiK8STrainingService'; import { EnvironmentManager } from './environmentManager'; import { Container, Scope } from 'typescript-ioc'; import { EnvironmentService } from './environment'; diff --git a/src/nni_manager/training_service/pai/reusable/jobRestServer.ts b/src/nni_manager/training_service/reusable/jobRestServer.ts similarity index 97% rename from src/nni_manager/training_service/pai/reusable/jobRestServer.ts rename to src/nni_manager/training_service/reusable/jobRestServer.ts index 133d80942d..51e897b8ab 100644 --- a/src/nni_manager/training_service/pai/reusable/jobRestServer.ts +++ b/src/nni_manager/training_service/reusable/jobRestServer.ts @@ -21,7 +21,7 @@ import { EventEmitter } from 'events'; import { Request, Response, Router } from 'express'; -import { ClusterJobRestServer } from '../../common/clusterJobRestServer'; +import { ClusterJobRestServer } from '../common/clusterJobRestServer'; export interface ParameterFileMeta { readonly experimentId: string; diff --git a/src/nni_manager/training_service/pai/reusable/mountedStorageService.ts b/src/nni_manager/training_service/reusable/mountedStorageService.ts similarity index 100% rename from src/nni_manager/training_service/pai/reusable/mountedStorageService.ts rename to src/nni_manager/training_service/reusable/mountedStorageService.ts diff --git a/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts similarity index 98% rename from src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts rename to src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index 772d118e5c..ecbc871105 100644 --- 
a/src/nni_manager/training_service/pai/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -22,12 +22,12 @@ import * as fs from 'fs'; import * as request from 'request'; import { Deferred } from 'ts-deferred'; -import * as component from '../../../common/component'; -import { getExperimentId } from '../../../common/experimentStartupInfo'; -import { getLogger, Logger } from '../../../common/log'; -import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; -import { PAIClusterConfig } from '../paiConfig'; -import { NNIPAIK8STrialConfig } from '../paiK8S/paiK8SConfig'; +import * as component from '../../common/component'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../common/log'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { PAIClusterConfig } from '../pai/paiConfig'; +import { NNIPAIK8STrialConfig } from '../pai/paiK8S/paiK8SConfig'; import { EnvironmentInformation, EnvironmentService } from './environment'; import { StorageService } from './storageService'; diff --git a/src/nni_manager/training_service/pai/reusable/storageService.ts b/src/nni_manager/training_service/reusable/storageService.ts similarity index 98% rename from src/nni_manager/training_service/pai/reusable/storageService.ts rename to src/nni_manager/training_service/reusable/storageService.ts index f4ed872054..ed684c1d94 100644 --- a/src/nni_manager/training_service/pai/reusable/storageService.ts +++ b/src/nni_manager/training_service/reusable/storageService.ts @@ -19,12 +19,12 @@ 'use strict'; -import { uniqueString } from '../../../common/utils'; +import { uniqueString } from '../../common/utils'; import * as fs from 'fs'; import * as os from 'os'; import * as path from 'path'; -import { Logger, getLogger } from '../../../common/log'; -import { tarAdd } from '../../common/util'; +import { Logger, getLogger } 
from '../../common/log'; +import { tarAdd } from '../common/util'; export abstract class StorageService { protected localRoot: string = ""; From c67b162bce3d6a2f2eac047a2bc6430479530eb1 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Thu, 4 Jun 2020 16:38:49 +0800 Subject: [PATCH 09/98] support multi nodes rename methods of storageService move trial to a seperated file fix some bugs. --- .../training_service/reusable/environment.ts | 9 +- .../reusable/environmentManager.ts | 103 ++++++++++------ .../reusable/mountedStorageService.ts | 10 ++ .../reusable/openPaiEnvironmentService.ts | 26 ++-- .../reusable/storageService.ts | 25 ++-- tools/nni_trial_tool/protocol.py | 43 ++++--- tools/nni_trial_tool/trial.py | 111 +++++++++++++++++ tools/nni_trial_tool/trial_runner.py | 115 ++---------------- 8 files changed, 255 insertions(+), 187 deletions(-) create mode 100644 tools/nni_trial_tool/trial.py diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 2c4c7b8867..3d2b50ece2 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -45,7 +45,7 @@ export class TrialDetail implements TrialJobDetail { public isEarlyStopped?: boolean; public environment?: EnvironmentInformation; - private readonly TRIAL_METADATA_DIR = ".nni"; + public readonly TRIAL_METADATA_DIR = ".nni"; constructor(id: string, status: TrialJobStatus, submitTime: number, workingDirectory: string, form: TrialJobApplicationForm) { @@ -56,11 +56,6 @@ export class TrialDetail implements TrialJobDetail { this.form = form; this.tags = []; } - - public getExitCodeFileName(): string { - const storageService = component.get(StorageService); - return storageService.joinRemotePath(this.workingDirectory, this.TRIAL_METADATA_DIR, "code"); - } } export class RunnerSettings { @@ -92,7 +87,7 @@ export class EnvironmentInformation { public workingFolder: string = ""; 
public runnerWorkingFolder: string = ""; public command: string = ""; - public serverCount: number = 1; + public nodeCount: number = 1; constructor(id: string, jobName: string, jobId?: string) { this.id = id; diff --git a/src/nni_manager/training_service/reusable/environmentManager.ts b/src/nni_manager/training_service/reusable/environmentManager.ts index 5077fe1df3..99f335b9ee 100644 --- a/src/nni_manager/training_service/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/reusable/environmentManager.ts @@ -96,7 +96,7 @@ class EnvironmentManager implements TrainingService { const storageService = component.get(StorageService); const trialId: string = uniqueString(5); - const trialWorkingFolder: string = storageService.joinRemotePath('trials', trialId); + const trialWorkingFolder: string = storageService.joinPath('trials', trialId); const trialJobDetail: TrialDetail = new TrialDetail(trialId, "WAITING", Date.now(), trialWorkingFolder, form); this.trials.set(trialId, trialJobDetail); @@ -109,7 +109,7 @@ class EnvironmentManager implements TrainingService { const trialDetail = await this.getTrialJob(trialJobId); const storageService = component.get(StorageService); - const fileName = storageService.joinRemotePath(trialDetail.workingDirectory, generateParamFileName(form.hyperParameters)) + const fileName = storageService.joinPath(trialDetail.workingDirectory, generateParamFileName(form.hyperParameters)) // Write file content ( parameter.cfg ) to working folders await storageService.save(form.hyperParameters.value, fileName); @@ -151,13 +151,14 @@ class EnvironmentManager implements TrainingService { const storageService = component.get(StorageService); // Copy the compressed file to remoteDirectory and delete it const codeDir = path.resolve(this.trialConfig.codeDir); - const codeFileName = await storageService.copyDirectory(codeDir, "envs", true); - storageService.renameRemote(codeFileName, "nni-code.tar.gz"); + const envDir = 
storageService.joinPath("envs"); + const codeFileName = await storageService.copyDirectory(codeDir, envDir, true); + storageService.rename(codeFileName, "nni-code.tar.gz"); - const installFileName = storageService.joinRemotePath("envs", 'install_nni.sh'); + const installFileName = storageService.joinPath(envDir, 'install_nni.sh'); await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName); - const runnerSettings = storageService.joinRemotePath("envs", "settings.json"); + const runnerSettings = storageService.joinPath(envDir, "settings.json"); await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); this.log.info(`Environment Manager run loop started.`); @@ -191,9 +192,6 @@ class EnvironmentManager implements TrainingService { case TrialConfigMetadataKey.LOG_COLLECTION: this.runnerSettings.logCollection = value; break; - case TrialConfigMetadataKey.MULTI_PHASE: - // not useful, dismiss it. - break; case TrialConfigMetadataKey.TRIAL_CONFIG: // TODO to support more storage types by better parameters. 
this.trialConfig = JSON.parse(value); @@ -239,12 +237,12 @@ class EnvironmentManager implements TrainingService { let findingName: boolean = true; const command = encodeCommand(commantType, JSON.stringify(data)); const storageService = component.get(StorageService); - const commandPath = storageService.joinRemotePath(environment.workingFolder, `commands`); + const commandPath = storageService.joinPath(environment.workingFolder, `commands`); while (findingName) { fileName = `manager_command_${new Date().getTime()}.txt`; - filePath = storageService.joinRemotePath(commandPath, fileName); - if (!await storageService.existsRemote(filePath)) { + filePath = storageService.joinPath(commandPath, fileName); + if (!await storageService.exists(filePath)) { findingName = false; break; } @@ -268,7 +266,7 @@ class EnvironmentManager implements TrainingService { environments.push(environment); } }); - environmentService.updateEnvironmentsStatus(environments); + await environmentService.updateEnvironmentsStatus(environments); environments.forEach((environment) => { const oldIsAlive = environment.isAlive; @@ -301,41 +299,66 @@ class EnvironmentManager implements TrainingService { switch (currentStatus) { case "RUNNING": { - // check status consistence with environment. const environment = trial.environment; + if (environment === undefined) { this.log.error(`found running trial ${trial.id} has no environment, set trial to UNKNOWN.`); trial.status = "UNKNOWN"; - } else if (environment.status !== "RUNNING") { - this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environment.status}', set trial to environment status.`); - this.releaseEnvironment(trial); - trial.status = environment.status; + liveTrialsCount++; + continue; } - // check if it's done. 
- const fileName = trial.getExitCodeFileName(); - - if (await storageService.existsRemote(fileName) === true) { - const fileContent = await storageService.readRemoteFile(fileName); - const match: RegExpMatchArray | null = fileContent.trim() - .match(/^-?(\d+)\s+(\d+)$/); - if (match !== null) { - const { 1: code, 2: timestamp } = match; - - if (trial.status == currentStatus) { - // Update trial job's status based on result code - if (parseInt(code, 10) === 0) { - trial.status = 'SUCCEEDED'; - } else { - trial.status = 'FAILED'; + const codeFilePath = storageService.joinPath(trial.workingDirectory, trial.TRIAL_METADATA_DIR); + const remoteFiles = await storageService.listDirectory(codeFilePath); + + let isCompleted = false; + let latestTimestamp = 0; + let aggregatedCode = 0; + let completedCount = 0; + for (const fileName of remoteFiles) { + if (fileName.startsWith("code")) { + const fullName = storageService.joinPath(codeFilePath, fileName) + const fileContent = await storageService.readFileContent(fullName); + + const match: RegExpMatchArray | null = fileContent.trim().match(/^-?(\d+)\s+(\d+)$/); + if (match !== null) { + const { 1: code, 2: timestamp } = match; + const intCode = parseInt(code, 10) + latestTimestamp = Math.max(latestTimestamp, parseInt(timestamp, 10)); + if (intCode !== 0) { + // only save latest non-zero exit code + aggregatedCode = intCode; } + completedCount++; + } + } + } + + // for multiple running nodes, all completed mean complete. + // any failed node means failed. + // use <= for some wired cases. 
+ if (environment.nodeCount <= completedCount) { + + if (trial.status == currentStatus) { + // Update trial job's status based on result code + if (aggregatedCode === 0) { + trial.status = 'SUCCEEDED'; + } else { + trial.status = 'FAILED'; } - trial.endTime = parseInt(timestamp, 10); + } + trial.endTime = latestTimestamp; + this.releaseEnvironment(trial); + isCompleted = true; + } + + if (isCompleted === false) { + // check status consistence with environment. + if (environment.status !== "RUNNING") { + this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environment.status}', set trial to environment status.`); this.releaseEnvironment(trial); - } else { - liveTrialsCount++; + trial.status = environment.status; } - } else { liveTrialsCount++; } } @@ -386,7 +409,7 @@ class EnvironmentManager implements TrainingService { const name = `nni_exp_${this.experimentId}_env_${envId}`; const environment = new EnvironmentInformation(envId, name); - environment.workingFolder = storageService.joinRemotePath("envs", envId); + environment.workingFolder = storageService.joinPath("envs", envId); environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; await storageService.createDirectory(environment.workingFolder); @@ -394,7 +417,7 @@ class EnvironmentManager implements TrainingService { const isDebuging = true; if (isDebuging) { // environment.status = "RUNNING"; - await storageService.copyDirectory("D:\\code\\nni\\tools\\nni_trial_tool", environment.workingFolder); + await storageService.copyDirectory("../nni/tools/nni_trial_tool", environment.workingFolder); } this.environments.set(environment.id, environment); diff --git a/src/nni_manager/training_service/reusable/mountedStorageService.ts b/src/nni_manager/training_service/reusable/mountedStorageService.ts index 19e600ff70..1ef61cbcf9 100644 --- a/src/nni_manager/training_service/reusable/mountedStorageService.ts +++ 
b/src/nni_manager/training_service/reusable/mountedStorageService.ts @@ -128,6 +128,16 @@ export class MountedStorageService extends StorageService { } + protected async internalList(remotePath: string): Promise { + let results: string[] = []; + + if (this.internalExists(remotePath)) { + results = await fs.promises.readdir(remotePath); + } + + return results; + } + protected internalIsRelativePath(remotePath: string): boolean { return !path.isAbsolute(remotePath); } diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index ecbc871105..01b0bb3b67 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -78,7 +78,7 @@ export class OpenPaiEnvironmentService implements EnvironmentService { request(getJobInfoRequest, async (error: any, response: request.Response, body: any) => { if ((error !== undefined && error !== null) || response.statusCode >= 400) { - this.log.error(`PAI Training service: get environment info from PAI Cluster failed!\nerror: ${error}`); + this.log.error(`OpenPAI: get environment list from PAI Cluster failed!\nerror: ${error}`); deferred.reject(error); } else { const jobInfos = new Map(); @@ -201,7 +201,7 @@ export class OpenPaiEnvironmentService implements EnvironmentService { request(stopJobRequest, (error, response, _body) => { try { if ((error !== undefined && error !== null) || (response && response.statusCode >= 400)) { - this.log.error(`OpenPAI Training service: stop job ${environment.jobId} failed with ${response.statusCode}\n${error}`); + this.log.error(`OpenPAI: stop job ${environment.jobId} failed with ${response.statusCode}\n${error}`); deferred.reject((error !== undefined && error !== null) ? 
error : `Stop trial failed, http code: ${response.statusCode}`); } else { @@ -243,7 +243,7 @@ export class OpenPaiEnvironmentService implements EnvironmentService { // Validate to make sure codeDir doesn't have too many files const storageService = component.get(StorageService); - const remoteRoot = storageService.joinRemotePath(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId); + const remoteRoot = storageService.joinPath(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId); storageService.initialize(this.paiTrialConfig.nniManagerNFSMountPath, remoteRoot); if (this.paiTrialConfig.paiConfigPath) { @@ -283,18 +283,26 @@ export class OpenPaiEnvironmentService implements EnvironmentService { nniJobConfig.name = jobName; if (nniJobConfig.taskRoles) { - // Each taskRole will generate new command in NNI's command format - // Each command will be formatted to NNI style - for (const taskRoleIndex in nniJobConfig.taskRoles) { - const taskRole = nniJobConfig.taskRoles[taskRoleIndex]; + environment.nodeCount = 0; + // count instance + for (const taskRoleName in nniJobConfig.taskRoles) { + const taskRole = nniJobConfig.taskRoles[taskRoleName]; let instanceCount = 1; if (taskRole.instances) { instanceCount = taskRole.instances; } + environment.nodeCount += instanceCount; + } - environment.serverCount += instanceCount; - const nniTrialCommand = `${environment.command} ${taskRole.commands.join(" && ").replace(/(["'$`\\])/g, '\\$1')}`; + // Each taskRole will generate new command in NNI's command format + // Each command will be formatted to NNI style + for (const taskRoleName in nniJobConfig.taskRoles) { + const taskRole = nniJobConfig.taskRoles[taskRoleName]; + // replace ' to '\'' + const joinedCommand = taskRole.commands.join(" && ").replace("'", "'\\''"); + let nniTrialCommand = `${environment.command} --node_count ${environment.nodeCount} --trial_command '${joinedCommand.trim()}'`; + this.log.debug(`replace command ${taskRole.commands} to 
${[nniTrialCommand]}`); taskRole.commands = [nniTrialCommand]; } } diff --git a/src/nni_manager/training_service/reusable/storageService.ts b/src/nni_manager/training_service/reusable/storageService.ts index ed684c1d94..402203e5c4 100644 --- a/src/nni_manager/training_service/reusable/storageService.ts +++ b/src/nni_manager/training_service/reusable/storageService.ts @@ -38,6 +38,7 @@ export abstract class StorageService { protected abstract async internalCopy(sourcePath: string, targetPath: string, isDirectory: boolean, isFromRemote: boolean, isToRemote: boolean): Promise; protected abstract async internalExists(remotePath: string): Promise; protected abstract async internalRead(remotePath: string, offset: number, length: number): Promise; + protected abstract async internalList(remotePath: string): Promise; protected abstract internalIsRelativePath(path: string): boolean; protected abstract internalJoin(...paths: string[]): string; protected abstract internalDirname(...paths: string[]): string; @@ -53,7 +54,7 @@ export abstract class StorageService { this.remoteRoot = remoteRoot; } - public async renameRemote(remotePath: string, newName: string): Promise { + public async rename(remotePath: string, newName: string): Promise { remotePath = this.expandPath(true, remotePath); this.logger.debug(`rename remotePath: ${remotePath} to: ${newName}`); await this.internalRename(remotePath, newName); @@ -69,7 +70,7 @@ export abstract class StorageService { localPath = this.expandPath(false, localPath); remotePath = this.expandPath(true, remotePath); this.logger.debug(`copy localPath: ${localPath} to remotePath: ${remotePath}, asGzip ${asGzip}`); - if (!await this.existsRemote(remotePath)) { + if (!await this.exists(remotePath)) { await this.internalMkdir(remotePath); } @@ -106,16 +107,22 @@ export abstract class StorageService { await this.internalRemove(remotePath, true, isRecursive); } - public async readRemoteFile(remotePath: string, offset: number = -1, length: number = 
-1): Promise { + public async readFileContent(remotePath: string, offset: number = -1, length: number = -1): Promise { remotePath = this.expandPath(true, remotePath); this.logger.debug(`read remote file: ${remotePath}, offset: ${offset}, length: ${length}`); return this.internalRead(remotePath, offset, length); } - public async existsRemote(remotePath: string): Promise { + public async listDirectory(remotePath: string): Promise { + remotePath = this.expandPath(true, remotePath); + this.logger.debug(`list remotePath: ${remotePath}`); + return await this.internalList(remotePath); + } + + public async exists(remotePath: string): Promise { remotePath = this.expandPath(true, remotePath); const exists = await this.internalExists(remotePath); - this.logger.debug(`check exists remotePath: ${remotePath} is ${exists}`); + this.logger.debug(`exists remotePath: ${remotePath} is ${exists}`); return exists } @@ -135,14 +142,14 @@ export abstract class StorageService { } await fs.promises.writeFile(localTempFileName, content); await this.internalCopy(localTempFileName, remoteDir, false, false, true); - await this.renameRemote(remoteTempFile, fileName); + await this.rename(remoteTempFile, fileName); await fs.promises.unlink(localTempFileName); } public async copyFile(localPath: string, remotePath: string): Promise { localPath = this.expandPath(false, localPath); remotePath = this.expandPath(true, remotePath); - this.logger.debug(`copy file localPath: ${localPath} to remotePath: ${remotePath}`); + this.logger.debug(`copying file localPath: ${localPath} to remotePath: ${remotePath}`); await this.internalCopy(localPath, remotePath, false, false, true); } @@ -159,7 +166,7 @@ export abstract class StorageService { await this.internalRemove(remotePath, false, false); } - public joinRemotePath(...paths: string[]): string { + public joinPath(...paths: string[]): string { let fullPath = this.internalJoin(...paths); if (this.internalIsRelativePath(fullPath) === true && this.remoteRoot !== 
"") { fullPath = this.internalJoin(this.remoteRoot, fullPath); @@ -171,7 +178,7 @@ export abstract class StorageService { let normalizedPath: string; if (isRemote) { - normalizedPath = this.joinRemotePath(...paths); + normalizedPath = this.joinPath(...paths); } else { normalizedPath = path.join(...paths); if (!path.isAbsolute(normalizedPath) && this.localRoot !== "") { diff --git a/tools/nni_trial_tool/protocol.py b/tools/nni_trial_tool/protocol.py index f6863d198c..604d81c042 100644 --- a/tools/nni_trial_tool/protocol.py +++ b/tools/nni_trial_tool/protocol.py @@ -13,6 +13,8 @@ runner_command_prefix = "runner_command_" manager_command_prefix = "manager_command_" +parsed_commands = set() + class CommandType(Enum): Initialize = b'IN' @@ -55,7 +57,7 @@ def send(command, data): break -def receive(): +def receive(is_keep_parsed=True): """Receive a command from Training Service. Returns a tuple of command (CommandType) and payload (str) """ @@ -66,27 +68,28 @@ def receive(): pending_commands = [] if os.path.exists(command_path): command_files = os.listdir(command_path) - for item in command_files: - if (item.startswith(manager_command_prefix)): - pending_commands.append(item) + for file_name in command_files: + if (file_name.startswith(manager_command_prefix)) and file_name not in parsed_commands: + pending_commands.append(file_name) pending_commands.sort() - if len(pending_commands) > 0: - for command_file in pending_commands: - command_file = os.path.join(command_path, command_file) - with open(command_file, "rb") as _in_file: - header = _in_file.read(16) - nni_log(LogType.Info, 'Received command, header: [%s]' % header) - if header is None or len(header) < 16: - # invalid header - nni_log(LogType.Error, 'incorrect command is found!') - return None, None - length = int(header[2:]) - data = _in_file.read(length) - command = CommandType(header[:2]) - data = json.loads(data.decode('utf8')) - nni_log(LogType.Info, 'Received command, data: [%s]' % data) - 
os.remove(command_file) + for file_name in pending_commands: + full_file_name = os.path.join(command_path, file_name) + with open(full_file_name, "rb") as _in_file: + header = _in_file.read(16) + nni_log(LogType.Info, 'Received command, header: [%s]' % header) + if header is None or len(header) < 16: + # invalid header + nni_log(LogType.Error, 'incorrect command is found!') + return None, None + length = int(header[2:]) + data = _in_file.read(length) + command = CommandType(header[:2]) + data = json.loads(data.decode('utf8')) + nni_log(LogType.Info, 'Received command, data: [%s]' % data) + if not is_keep_parsed: + os.remove(full_file_name) + parsed_commands.add(file_name) except Exception as identifier: nni_log(LogType.Error, 'meet unhandled exception: %s' % identifier) return command, data diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py new file mode 100644 index 0000000000..fc3d3eab31 --- /dev/null +++ b/tools/nni_trial_tool/trial.py @@ -0,0 +1,111 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import ctypes +import os +import shlex +import tarfile +import random +from datetime import datetime +from subprocess import Popen + +import psutil + +from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log + +trial_output_path_name = ".nni" + + +class Trial: + def __init__(self, args, data): + self.process = None + self.data = data + self.args = args + self.trial_syslogger_stdout = None + + global NNI_TRIAL_JOB_ID + self.id = data["trialId"] + if self.id is None: + raise Exception("trial_id is not found in %s" % data) + os.environ['NNI_TRIAL_JOB_ID'] = self.id + NNI_TRIAL_JOB_ID = self.id + + def run(self): + # redirect trial runner's stdout and stderr to syslog + self.trial_syslogger_stdout = RemoteLogger(self.args.nnimanager_ip, self.args.nnimanager_port, 'trial', StdOutputType.Stdout, + self.args.log_collection, self.id) + + nni_log(LogType.Info, "start to run trial %s" % self.id) + + trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) + + os.environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) + os.environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") + os.environ['NNI_SYS_DIR'] = trial_working_dir + + self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) + os.makedirs(self.trial_output_dir, exist_ok=True) + trial_code_dir = os.path.join(trial_working_dir, "code") + os.makedirs(trial_code_dir, exist_ok=True) + + # prepare code + with tarfile.open(os.path.join("..", "nni-code.tar.gz"), "r:gz") as tar: + tar.extractall(trial_code_dir) + + # save parameters + nni_log(LogType.Info, 'saving parameter %s' % self.data["parameter"]["value"]) + parameter_file_name = os.path.join(trial_working_dir, "parameter.cfg") + with open(parameter_file_name, "w") as parameter_file: + parameter_file.write(self.data["parameter"]["value"]) + + # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior + 
self.log_pipe_stdout = self.trial_syslogger_stdout.get_pipelog_reader() + self.process = Popen(self.args.trial_command, shell=True, stdout=self.log_pipe_stdout, + stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=os.environ) + nni_log(LogType.Info, 'Trial runner spawns a subprocess (pid {0}) to run command: {1}'. + format(self.process.pid, shlex.split(self.args.trial_command))) + + def is_running(self): + if (self.process is None): + return False + + retCode = self.process.poll() + # child worker process exits and all stdout data is read + if retCode is not None and self.log_pipe_stdout.set_process_exit() and self.log_pipe_stdout.is_read_completed == True: + # In Windows, the retCode -1 is 4294967295. It's larger than c_long, and raise OverflowError. + # So covert it to int32. + retCode = ctypes.c_long(retCode).value + nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. Quit'.format(retCode)) + + # Exit as the retCode of subprocess(trial) + exit_code_file_name = os.path.join(self.trial_output_dir, "code") + if (self.args.node_count > 1): + while True: + exit_code_file_name = "%s_%s" % (exit_code_file_name, random.randint(0, 10000)) + if not os.path.exists(exit_code_file_name): + break + with open(exit_code_file_name, "w") as exit_file: + exit_file.write("%s %s" % (retCode, int(datetime.now().timestamp() * 1000))) + self.cleanup() + return False + else: + return True + + def kill(self, trial_id=None): + if trial_id == self.id or trial_id is None: + if self.process is not None: + nni_log(LogType.Info, "killing trial %s" % self.id) + for child in psutil.Process(self.process.pid).children(True): + child.kill() + self.process.kill() + self.cleanup() + + def cleanup(self): + nni_log(LogType.Info, "clean up trial %s" % self.id) + self.process = None + if self.log_pipe_stdout is not None: + self.log_pipe_stdout.set_process_exit() + self.log_pipe_stdout = None + if self.trial_syslogger_stdout is not None: + self.trial_syslogger_stdout.close() + 
self.trial_syslogger_stdout = None diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index c77f63c16b..1635b7dec6 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -2,120 +2,21 @@ # Licensed under the MIT license. import argparse -import ctypes import json -import logging import os import re -import shlex import sys -import tarfile import threading import time from datetime import datetime -from subprocess import Popen import pkg_resources -import psutil idle_timeout_seconds = 10 * 60 - -logger = logging.getLogger('runner') regular = re.compile('v?(?P[0-9](\.[0-9]){0,1}).*') -trial_output_path_name = ".nni" trial_runner_syslogger = None -class Trial: - def __init__(self, args, data): - self.process = None - self.data = data - self.args = args - self.trial_syslogger_stdout = None - - global NNI_TRIAL_JOB_ID - self.id = data["trialId"] - if self.id is None: - raise Exception("trial_id is not found in %s" % data) - os.environ['NNI_TRIAL_JOB_ID'] = self.id - NNI_TRIAL_JOB_ID = self.id - - def run(self): - # redirect trial runner's stdout and stderr to syslog - self.trial_syslogger_stdout = RemoteLogger(self.args.nnimanager_ip, self.args.nnimanager_port, 'trial', StdOutputType.Stdout, - self.args.log_collection, self.id) - - nni_log(LogType.Info, "start to run trial %s" % self.id) - - trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) - - os.environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) - os.environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") - os.environ['NNI_SYS_DIR'] = trial_working_dir - - self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) - os.makedirs(self.trial_output_dir, exist_ok=True) - trial_code_dir = os.path.join(trial_working_dir, "code") - os.makedirs(trial_code_dir, exist_ok=True) - - # prepare code - with tarfile.open(os.path.join("..", "nni-code.tar.gz"), "r:gz") as 
tar: - tar.extractall(trial_code_dir) - - # save parameters - nni_log(LogType.Info, 'saving parameter %s' % self.data["parameter"]["value"]) - parameter_file_name = os.path.join(trial_working_dir, "parameter.cfg") - with open(parameter_file_name, "w") as parameter_file: - parameter_file.write(self.data["parameter"]["value"]) - - # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior - self.log_pipe_stdout = self.trial_syslogger_stdout.get_pipelog_reader() - self.process = Popen(self.args.trial_command, shell=True, stdout=self.log_pipe_stdout, - stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=os.environ) - nni_log(LogType.Info, 'Trial runner spawns a subprocess (pid {0}) to run command: {1}'. - format(self.process.pid, shlex.split(self.args.trial_command))) - - def is_running(self): - if (self.process is None): - return False - - retCode = self.process.poll() - # child worker process exits and all stdout data is read - if retCode is not None and self.log_pipe_stdout.set_process_exit() and self.log_pipe_stdout.is_read_completed == True: - # In Windows, the retCode -1 is 4294967295. It's larger than c_long, and raise OverflowError. - # So covert it to int32. - retCode = ctypes.c_long(retCode).value - nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. 
Quit'.format(retCode)) - - # Exit as the retCode of subprocess(trial) - exit_code_file_name = os.path.join(self.trial_output_dir, "code") - with open(exit_code_file_name, "w") as exit_file: - exit_file.write("%s %s" % (retCode, int(datetime.now().timestamp() * 1000))) - self.cleanup() - return False - else: - return True - - def kill(self, trial_id=None): - if trial_id == self.id or trial_id is None: - if self.process is not None: - nni_log(LogType.Info, "killing trial %s" % self.id) - for child in psutil.Process(self.process.pid).children(True): - child.kill() - self.process.kill() - self.cleanup() - - def cleanup(self): - nni_log(LogType.Info, "clean up trial %s" % self.id) - self.process = None - if self.log_pipe_stdout is not None: - self.log_pipe_stdout.set_process_exit() - self.log_pipe_stdout = None - if self.trial_syslogger_stdout is not None: - self.trial_syslogger_stdout.close() - self.trial_syslogger_stdout = None - - def main_loop(args): '''main loop logic for trial runner''' idle_last_time = datetime.now() @@ -123,14 +24,18 @@ def main_loop(args): StdOutputType.Stdout, args.log_collection, args.runner_id) sys.stdout = sys.stderr = trial_runner_syslogger trial = None + is_multi_node = args.node_count > 1 try: # command loop while True: - command_type, command_data = receive() + command_type, command_data = receive(is_multi_node) if command_type == CommandType.NewTrialJob: if trial is not None: - raise Exception('trial %s is running already, cannot start a new one' % trial.trial_id) + if trial.is_running(): + raise Exception('trial %s is running already, cannot start a new one' % trial.id) + else: + trial = None trial = Trial(args, command_data) trial.run() elif command_type == CommandType.KillTrialJob: @@ -148,10 +53,11 @@ def main_loop(args): nni_log(LogType.Info, "trial runner is idle more than {0} seconds, so exit.".format( idle_timeout_seconds)) break - time.sleep(1) + time.sleep(0.5) except Exception as ex: nni_log(LogType.Error, ex) finally: + 
nni_log(LogType.Info, "main_loop exits.") if trial is not None: trial.kill() @@ -229,6 +135,7 @@ def run(self): PARSER.add_argument('--nnimanager_port', type=str, help='NNI manager rest server port') PARSER.add_argument('--nni_manager_version', type=str, help='the nni version transmitted from nniManager') PARSER.add_argument('--log_collection', type=str, help='set the way to collect log in trial runner') + PARSER.add_argument('--node_count', type=int, help='number of nodes, it determines how to consume command and save code file') args, unknown = PARSER.parse_known_args() setting_file = "../settings.json" @@ -251,6 +158,9 @@ def run(self): args.nni_manager_version = settings["nniManagerVersion"] if args.log_collection is None: args.log_collection = settings["logCollection"] + if args.node_count is None: + # default has only one node. + args.node_count = 1 os.environ['NNI_OUTPUT_DIR'] = os.curdir + "/nnioutput" os.environ['NNI_PLATFORM'] = args.platform @@ -263,6 +173,7 @@ def run(self): from .rest_utils import rest_get, rest_post from .url_utils import gen_parameter_meta_url, gen_send_version_url from .protocol import CommandType, receive + from .trial import Trial nni_log(LogType.Info, "merged args is {}".format(args)) From e13a6208761370c4eff4a2d1d67cdf5fb2d6e791 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Thu, 4 Jun 2020 17:06:25 +0800 Subject: [PATCH 10/98] fix eslint errors --- src/nni_manager/training_service/reusable/environment.ts | 2 -- .../training_service/reusable/openPaiEnvironmentService.ts | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 3d2b50ece2..9f8d601999 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -20,8 +20,6 @@ 'use strict'; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from 
"../../common/trainingService"; -import { StorageService } from "./storageService"; -import * as component from '../../common/component'; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index 01b0bb3b67..7e6bebdd20 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -301,7 +301,7 @@ export class OpenPaiEnvironmentService implements EnvironmentService { const taskRole = nniJobConfig.taskRoles[taskRoleName]; // replace ' to '\'' const joinedCommand = taskRole.commands.join(" && ").replace("'", "'\\''"); - let nniTrialCommand = `${environment.command} --node_count ${environment.nodeCount} --trial_command '${joinedCommand.trim()}'`; + const nniTrialCommand = `${environment.command} --node_count ${environment.nodeCount} --trial_command '${joinedCommand.trim()}'`; this.log.debug(`replace command ${taskRole.commands} to ${[nniTrialCommand]}`); taskRole.commands = [nniTrialCommand]; } From 59d4a711389a8c070dd27881f69645d0cc1678c7 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Fri, 5 Jun 2020 11:17:16 +0800 Subject: [PATCH 11/98] support multi environments better fix openPAI breaking changes --- .../pai/paiK8S/paiK8STrainingService.ts | 7 +- .../reusable/environmentManager.ts | 88 +++++++++++-------- .../reusable/openPaiEnvironmentService.ts | 7 +- tools/nni_trial_tool/trial.py | 73 +++++++++------ tools/nni_trial_tool/trial_runner.py | 19 +++- 5 files changed, 129 insertions(+), 65 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 0c92c35ed7..d23ec3a021 100644 --- 
a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -201,6 +201,8 @@ class PAIK8STrainingService extends PAITrainingService { } } else { + const containerPathParts = this.paiTrialConfig.containerNFSMountPath.split("/"); + const containerPathName = containerPathParts[containerPathParts.length - 1]; nniJobConfig = { protocolVersion: 2, name: jobName, @@ -235,7 +237,10 @@ class PAIK8STrainingService extends PAITrainingService { extras: { 'com.microsoft.pai.runtimeplugin': [ { - plugin: this.paiTrialConfig.paiStoragePlugin + plugin: this.paiTrialConfig.paiStoragePlugin, + parameters: [ + containerPathName + ] } ], submitFrom: 'submit-job-v2' diff --git a/src/nni_manager/training_service/reusable/environmentManager.ts b/src/nni_manager/training_service/reusable/environmentManager.ts index 99f335b9ee..4bea8161e4 100644 --- a/src/nni_manager/training_service/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/reusable/environmentManager.ts @@ -308,56 +308,70 @@ class EnvironmentManager implements TrainingService { continue; } + let isCompleted = false; + let remoteFiles: string[] = []; + const codeFilePath = storageService.joinPath(trial.workingDirectory, trial.TRIAL_METADATA_DIR); - const remoteFiles = await storageService.listDirectory(codeFilePath); + // the folder may not exist at initial stage. 
Just ignore, if it doesn't exists; + if (await storageService.exists(codeFilePath)) { + remoteFiles = await storageService.listDirectory(codeFilePath); + } - let isCompleted = false; - let latestTimestamp = 0; - let aggregatedCode = 0; - let completedCount = 0; - for (const fileName of remoteFiles) { - if (fileName.startsWith("code")) { - const fullName = storageService.joinPath(codeFilePath, fileName) - const fileContent = await storageService.readFileContent(fullName); - - const match: RegExpMatchArray | null = fileContent.trim().match(/^-?(\d+)\s+(\d+)$/); - if (match !== null) { - const { 1: code, 2: timestamp } = match; - const intCode = parseInt(code, 10) - latestTimestamp = Math.max(latestTimestamp, parseInt(timestamp, 10)); - if (intCode !== 0) { - // only save latest non-zero exit code - aggregatedCode = intCode; + if (remoteFiles.length > 0) { + let latestTimestamp = 0; + let aggregatedCode = 0; + let completedCount = 0; + for (const fileName of remoteFiles) { + if (fileName.startsWith("code")) { + const fullName = storageService.joinPath(codeFilePath, fileName) + const fileContent = await storageService.readFileContent(fullName); + + const match: RegExpMatchArray | null = fileContent.trim().match(/^-?(\d+)\s+(\d+)$/); + if (match !== null) { + const { 1: code, 2: timestamp } = match; + const intCode = parseInt(code, 10) + latestTimestamp = Math.max(latestTimestamp, parseInt(timestamp, 10)); + if (intCode !== 0) { + // only save latest non-zero exit code + aggregatedCode = intCode; + } + completedCount++; } - completedCount++; } } - } - // for multiple running nodes, all completed mean complete. - // any failed node means failed. - // use <= for some wired cases. 
- if (environment.nodeCount <= completedCount) { - - if (trial.status == currentStatus) { - // Update trial job's status based on result code - if (aggregatedCode === 0) { - trial.status = 'SUCCEEDED'; - } else { - trial.status = 'FAILED'; + // for multiple running nodes, if one node is done, thought it's done. + // any failed node means failed. + // use <= for some wired cases. + if (completedCount > 0) { + this.log.debug(`found ${completedCount} completed trial process(es), nodeCount: ${environment.nodeCount}`); + + // if some trial processes doesn't exit, kill it for next one. + // for example, in horovod, it's just sleep command, has no impact on trial result. + if (environment.nodeCount >= completedCount){ + this.sendCommand(KILL_TRIAL_JOB, trial.id, environment); + } + if (trial.status == currentStatus) { + // Update trial job's status based on result code + if (aggregatedCode === 0) { + trial.status = 'SUCCEEDED'; + } else { + trial.status = 'FAILED'; + } } + trial.endTime = latestTimestamp; + this.releaseEnvironment(trial); + isCompleted = true; } - trial.endTime = latestTimestamp; - this.releaseEnvironment(trial); - isCompleted = true; } if (isCompleted === false) { // check status consistence with environment. 
- if (environment.status !== "RUNNING") { - this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environment.status}', set trial to environment status.`); + const environmentStatus = environment.status; + if (environmentStatus !== "RUNNING") { + this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environmentStatus}', set trial to environment status.`); this.releaseEnvironment(trial); - trial.status = environment.status; + trial.status = environmentStatus; } liveTrialsCount++; } diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index 7e6bebdd20..e84c6909dc 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -308,6 +308,8 @@ export class OpenPaiEnvironmentService implements EnvironmentService { } } else { + const containerPathParts = this.paiTrialConfig.containerNFSMountPath.split("/"); + const containerPathName = containerPathParts[containerPathParts.length - 1]; nniJobConfig = { protocolVersion: 2, name: jobName, @@ -342,7 +344,10 @@ export class OpenPaiEnvironmentService implements EnvironmentService { extras: { 'com.microsoft.pai.runtimeplugin': [ { - plugin: this.paiTrialConfig.paiStoragePlugin + plugin: this.paiTrialConfig.paiStoragePlugin, + parameters: [ + containerPathName + ] } ], submitFrom: 'submit-job-v2' diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index fc3d3eab31..f6ecd393cf 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -3,9 +3,10 @@ import ctypes import os +import random import shlex import tarfile -import random +import time from datetime import datetime from subprocess import Popen @@ -30,40 +31,62 @@ def __init__(self, args, data): os.environ['NNI_TRIAL_JOB_ID'] = self.id NNI_TRIAL_JOB_ID = self.id + # for 
multiple nodes. If it's None, it means single node. + self.node_id = args.node_id + if self.node_id is None: + self.name = self.id + else: + self.name = "%s_%s" % (self.id, self.node_id) + def run(self): - # redirect trial runner's stdout and stderr to syslog + # redirect trial's stdout and stderr to syslog self.trial_syslogger_stdout = RemoteLogger(self.args.nnimanager_ip, self.args.nnimanager_port, 'trial', StdOutputType.Stdout, - self.args.log_collection, self.id) + self.args.log_collection, self.name) - nni_log(LogType.Info, "start to run trial %s" % self.id) + nni_log(LogType.Info, "%s: start to run trial" % self.name) trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) + self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) + trial_code_dir = os.path.join(trial_working_dir, "code") os.environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) os.environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") os.environ['NNI_SYS_DIR'] = trial_working_dir - self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) - os.makedirs(self.trial_output_dir, exist_ok=True) - trial_code_dir = os.path.join(trial_working_dir, "code") - os.makedirs(trial_code_dir, exist_ok=True) - - # prepare code - with tarfile.open(os.path.join("..", "nni-code.tar.gz"), "r:gz") as tar: - tar.extractall(trial_code_dir) - - # save parameters - nni_log(LogType.Info, 'saving parameter %s' % self.data["parameter"]["value"]) - parameter_file_name = os.path.join(trial_working_dir, "parameter.cfg") - with open(parameter_file_name, "w") as parameter_file: - parameter_file.write(self.data["parameter"]["value"]) + # prepare code and parameters + prepared_flag_file_name = os.path.join(trial_working_dir, "trial_prepared") + if not os.path.exists(trial_working_dir): + os.makedirs(trial_working_dir, exist_ok=True) + + os.makedirs(self.trial_output_dir, exist_ok=True) + # prepare code + 
os.makedirs(trial_code_dir, exist_ok=True) + with tarfile.open(os.path.join("..", "nni-code.tar.gz"), "r:gz") as tar: + tar.extractall(trial_code_dir) + + # save parameters + nni_log(LogType.Info, '%s: saving parameter %s' % (self.name, self.data["parameter"]["value"])) + parameter_file_name = os.path.join(trial_working_dir, "parameter.cfg") + with open(parameter_file_name, "w") as parameter_file: + parameter_file.write(self.data["parameter"]["value"]) + + # ready flag + with open(prepared_flag_file_name, "w") as prepared_flag_file: + prepared_flag_file.write("%s" % (int(datetime.now().timestamp() * 1000))) + + # make sure code prepared by other node. + if self.node_id is not None: + while True: + if os.path.exists(prepared_flag_file_name): + break + time.sleep(0.1) # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior self.log_pipe_stdout = self.trial_syslogger_stdout.get_pipelog_reader() self.process = Popen(self.args.trial_command, shell=True, stdout=self.log_pipe_stdout, stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=os.environ) - nni_log(LogType.Info, 'Trial runner spawns a subprocess (pid {0}) to run command: {1}'. - format(self.process.pid, shlex.split(self.args.trial_command))) + nni_log(LogType.Info, '{0}: spawns a subprocess (pid {1}) to run command: {2}'. + format(self.name, self.process.pid, shlex.split(self.args.trial_command))) def is_running(self): if (self.process is None): @@ -75,13 +98,13 @@ def is_running(self): # In Windows, the retCode -1 is 4294967295. It's larger than c_long, and raise OverflowError. # So covert it to int32. retCode = ctypes.c_long(retCode).value - nni_log(LogType.Info, 'subprocess terminated. Exit code is {}. Quit'.format(retCode)) + nni_log(LogType.Info, '{0}: subprocess terminated. Exit code is {1}. 
Quit'.format(self.name, retCode)) # Exit as the retCode of subprocess(trial) exit_code_file_name = os.path.join(self.trial_output_dir, "code") - if (self.args.node_count > 1): + if (self.node_id is not None): while True: - exit_code_file_name = "%s_%s" % (exit_code_file_name, random.randint(0, 10000)) + exit_code_file_name = "%s_%s" % (exit_code_file_name, self.node_id) if not os.path.exists(exit_code_file_name): break with open(exit_code_file_name, "w") as exit_file: @@ -94,14 +117,14 @@ def is_running(self): def kill(self, trial_id=None): if trial_id == self.id or trial_id is None: if self.process is not None: - nni_log(LogType.Info, "killing trial %s" % self.id) + nni_log(LogType.Info, "%s: killing trial" % self.name) for child in psutil.Process(self.process.pid).children(True): child.kill() self.process.kill() self.cleanup() def cleanup(self): - nni_log(LogType.Info, "clean up trial %s" % self.id) + nni_log(LogType.Info, "%s: clean up trial" % self.name) self.process = None if self.log_pipe_stdout is not None: self.log_pipe_stdout.set_process_exit() diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 1635b7dec6..1416767b9a 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -4,6 +4,7 @@ import argparse import json import os +import random import re import sys import threading @@ -20,11 +21,24 @@ def main_loop(args): '''main loop logic for trial runner''' idle_last_time = datetime.now() + is_multi_node = args.node_count > 1 + + if (is_multi_node): + # for multiple nodes, create a file to get a unique id. 
+ while True: + node_id = random.randint(0, 10000) + unique_check_file_name = "node_%s" % (node_id) + if not os.path.exists(unique_check_file_name): + break + with open(unique_check_file_name, "w") as unique_check_file: + unique_check_file.write("%s" % (int(datetime.now().timestamp() * 1000))) + args.node_id = node_id + args.runner_id = "%s_%s" % (args.runner_id, node_id) + trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'runner', StdOutputType.Stdout, args.log_collection, args.runner_id) sys.stdout = sys.stderr = trial_runner_syslogger trial = None - is_multi_node = args.node_count > 1 try: # command loop @@ -146,7 +160,10 @@ def run(self): args.exp_id = settings["experimentId"] args.platform = settings["platform"] + # runner_id is unique node in experiment, and will be updated if it's multi-nodes args.runner_id = "runner_"+os.path.basename(os.path.realpath(os.path.curdir)) + # node id is unique in the runner + args.node_id = None if args.trial_command is None: args.trial_command = settings["command"] From 81c49cf8daa73229ce896039100489d7c4bdbbc3 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Fri, 5 Jun 2020 14:49:07 +0800 Subject: [PATCH 12/98] code refactor fix minor bugs --- .../reusable/environmentManager.ts | 7 +- .../reusable/mountedStorageService.ts | 2 +- .../reusable/openPaiEnvironmentService.ts | 4 +- tools/nni_trial_tool/trial.py | 7 +- tools/nni_trial_tool/trial_runner.py | 64 ++++++++++--------- 5 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environmentManager.ts b/src/nni_manager/training_service/reusable/environmentManager.ts index 4bea8161e4..c8038a7753 100644 --- a/src/nni_manager/training_service/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/reusable/environmentManager.ts @@ -312,10 +312,7 @@ class EnvironmentManager implements TrainingService { let remoteFiles: string[] = []; const codeFilePath = 
storageService.joinPath(trial.workingDirectory, trial.TRIAL_METADATA_DIR); - // the folder may not exist at initial stage. Just ignore, if it doesn't exists; - if (await storageService.exists(codeFilePath)) { - remoteFiles = await storageService.listDirectory(codeFilePath); - } + remoteFiles = await storageService.listDirectory(codeFilePath); if (remoteFiles.length > 0) { let latestTimestamp = 0; @@ -348,7 +345,7 @@ class EnvironmentManager implements TrainingService { // if some trial processes doesn't exit, kill it for next one. // for example, in horovod, it's just sleep command, has no impact on trial result. - if (environment.nodeCount >= completedCount){ + if (environment.nodeCount >= completedCount) { this.sendCommand(KILL_TRIAL_JOB, trial.id, environment); } if (trial.status == currentStatus) { diff --git a/src/nni_manager/training_service/reusable/mountedStorageService.ts b/src/nni_manager/training_service/reusable/mountedStorageService.ts index 1ef61cbcf9..9c4c6f9d13 100644 --- a/src/nni_manager/training_service/reusable/mountedStorageService.ts +++ b/src/nni_manager/training_service/reusable/mountedStorageService.ts @@ -131,7 +131,7 @@ export class MountedStorageService extends StorageService { protected async internalList(remotePath: string): Promise { let results: string[] = []; - if (this.internalExists(remotePath)) { + if (await this.internalExists(remotePath) === true) { results = await fs.promises.readdir(remotePath); } diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index e84c6909dc..4099249ade 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -300,8 +300,8 @@ export class OpenPaiEnvironmentService implements EnvironmentService { for (const taskRoleName in nniJobConfig.taskRoles) { const taskRole = 
nniJobConfig.taskRoles[taskRoleName]; // replace ' to '\'' - const joinedCommand = taskRole.commands.join(" && ").replace("'", "'\\''"); - const nniTrialCommand = `${environment.command} --node_count ${environment.nodeCount} --trial_command '${joinedCommand.trim()}'`; + const joinedCommand = taskRole.commands.join(" && ").replace("'", "'\\''").trim(); + const nniTrialCommand = `${environment.command} --node_count ${environment.nodeCount} --trial_command '${joinedCommand}'`; this.log.debug(`replace command ${taskRole.commands} to ${[nniTrialCommand]}`); taskRole.commands = [nniTrialCommand]; } diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index f6ecd393cf..ff9584e850 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -3,7 +3,6 @@ import ctypes import os -import random import shlex import tarfile import time @@ -41,13 +40,14 @@ def __init__(self, args, data): def run(self): # redirect trial's stdout and stderr to syslog self.trial_syslogger_stdout = RemoteLogger(self.args.nnimanager_ip, self.args.nnimanager_port, 'trial', StdOutputType.Stdout, - self.args.log_collection, self.name) + self.args.log_collection, self.id) nni_log(LogType.Info, "%s: start to run trial" % self.name) trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) trial_code_dir = os.path.join(trial_working_dir, "code") + trial_nnioutput_dir = os.path.join(trial_working_dir, "nnioutput") os.environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) os.environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") @@ -59,6 +59,7 @@ def run(self): os.makedirs(trial_working_dir, exist_ok=True) os.makedirs(self.trial_output_dir, exist_ok=True) + os.makedirs(trial_nnioutput_dir, exist_ok=True) # prepare code os.makedirs(trial_code_dir, exist_ok=True) with tarfile.open(os.path.join("..", "nni-code.tar.gz"), "r:gz") as 
tar: @@ -84,7 +85,7 @@ def run(self): # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior self.log_pipe_stdout = self.trial_syslogger_stdout.get_pipelog_reader() self.process = Popen(self.args.trial_command, shell=True, stdout=self.log_pipe_stdout, - stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=os.environ) + stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=dict(os.environ)) nni_log(LogType.Info, '{0}: spawns a subprocess (pid {1}) to run command: {2}'. format(self.name, self.process.pid, shlex.split(self.args.trial_command))) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 1416767b9a..8bc00b5076 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -23,21 +23,6 @@ def main_loop(args): idle_last_time = datetime.now() is_multi_node = args.node_count > 1 - if (is_multi_node): - # for multiple nodes, create a file to get a unique id. 
- while True: - node_id = random.randint(0, 10000) - unique_check_file_name = "node_%s" % (node_id) - if not os.path.exists(unique_check_file_name): - break - with open(unique_check_file_name, "w") as unique_check_file: - unique_check_file.write("%s" % (int(datetime.now().timestamp() * 1000))) - args.node_id = node_id - args.runner_id = "%s_%s" % (args.runner_id, node_id) - - trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'runner', - StdOutputType.Stdout, args.log_collection, args.runner_id) - sys.stdout = sys.stderr = trial_runner_syslogger trial = None try: @@ -75,9 +60,6 @@ def main_loop(args): if trial is not None: trial.kill() - trial_runner_syslogger.close() - trial_runner_syslogger = None - def trial_runner_help_info(*args): print('please run --help to see guidance') @@ -96,26 +78,26 @@ def check_version(args): else: try: trial_runner_version = regular.search(trial_runner_version).group('version') - nni_log(LogType.Info, 'runner_version is {0}'.format(trial_runner_version)) + nni_log(LogType.Info, '{0}: runner_version is {1}'.format(args.runner_name, trial_runner_version)) nni_manager_version = regular.search(args.nni_manager_version).group('version') - nni_log(LogType.Info, 'nni_manager_version is {0}'.format(nni_manager_version)) + nni_log(LogType.Info, '{0}: nni_manager_version is {1}'.format(args.runner_name, nni_manager_version)) log_entry = {} if trial_runner_version != nni_manager_version: - nni_log(LogType.Error, 'Version does not match!') - error_message = 'NNIManager version is {0}, Trial runner version is {1}, NNI version does not match!'.format( - nni_manager_version, trial_runner_version) + nni_log(LogType.Error, '{0}: Version does not match!'.format(args.runner_name)) + error_message = '{0}: NNIManager version is {1}, Trial runner version is {2}, NNI version does not match!'.format( + args.runner_name, nni_manager_version, trial_runner_version) log_entry['tag'] = 'VCFail' log_entry['msg'] = error_message 
rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_id), json.dumps(log_entry), 10, False) os._exit(1) else: - nni_log(LogType.Info, 'Version match!') + nni_log(LogType.Info, '{0}: Version match!'.format(args.runner_name)) log_entry['tag'] = 'VCSuccess' rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_id), json.dumps(log_entry), 10, False) except AttributeError as err: - nni_log(LogType.Error, err) + nni_log(LogType.Error, '{0}: {1}'.format(args.runner_name, err)) def fetch_parameter_file(args): @@ -160,10 +142,8 @@ def run(self): args.exp_id = settings["experimentId"] args.platform = settings["platform"] - # runner_id is unique node in experiment, and will be updated if it's multi-nodes + # runner_id is unique runner in experiment, and will be updated if it's multi-nodes args.runner_id = "runner_"+os.path.basename(os.path.realpath(os.path.curdir)) - # node id is unique in the runner - args.node_id = None if args.trial_command is None: args.trial_command = settings["command"] @@ -192,16 +172,38 @@ def run(self): from .protocol import CommandType, receive from .trial import Trial - nni_log(LogType.Info, "merged args is {}".format(args)) + is_multi_node = args.node_count > 1 + + if (is_multi_node): + # for multiple nodes, create a file to get a unique id. 
+ while True: + node_id = random.randint(0, 10000) + unique_check_file_name = "node_%s" % (node_id) + if not os.path.exists(unique_check_file_name): + break + with open(unique_check_file_name, "w") as unique_check_file: + unique_check_file.write("%s" % (int(datetime.now().timestamp() * 1000))) + args.node_id = node_id + args.runner_name = "%s_%s" % (args.runner_id, node_id) + else: + # node id is unique in the runner + args.node_id = None + # runner_name is unique node in experiment, and will be updated if it's multi-nodes + args.runner_name = args.runner_id + + trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'runner', + StdOutputType.Stdout, args.log_collection, args.runner_id) + sys.stdout = sys.stderr = trial_runner_syslogger + nni_log(LogType.Info, "{}: merged args is {}".format(args.runner_name, args)) if args.trial_command is None: - nni_log(LogType.Error, "no command is found.") + nni_log(LogType.Error, "{}: no command is found.".format(args.runner_name)) os._exit(1) check_version(args) try: main_loop(args) except SystemExit as se: - nni_log(LogType.Info, 'NNI trial runner exit with code {}'.format(se.code)) + nni_log(LogType.Info, '{}: NNI trial runner exit with code {}'.format(args.runner_name, se.code)) os._exit(se.code) finally: if trial_runner_syslogger is not None: From 92cab3a834553f9681bdcf0613d19b534b9425c6 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Fri, 5 Jun 2020 16:03:21 +0800 Subject: [PATCH 13/98] fix openpai yaml format --- .../reusable/openPaiEnvironmentService.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index 4099249ade..7414d9d83d 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -345,9 +345,11 @@ export class 
OpenPaiEnvironmentService implements EnvironmentService { 'com.microsoft.pai.runtimeplugin': [ { plugin: this.paiTrialConfig.paiStoragePlugin, - parameters: [ - containerPathName - ] + parameters: { + storageConfigNames: [ + containerPathName + ] + } } ], submitFrom: 'submit-job-v2' From 0674d88596b0e93bed6ae15793d84df2765eb9b2 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Fri, 5 Jun 2020 16:04:00 +0800 Subject: [PATCH 14/98] fix k8s yaml schema --- .../training_service/pai/paiK8S/paiK8STrainingService.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index d23ec3a021..6bff5897fc 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -238,9 +238,11 @@ class PAIK8STrainingService extends PAITrainingService { 'com.microsoft.pai.runtimeplugin': [ { plugin: this.paiTrialConfig.paiStoragePlugin, - parameters: [ - containerPathName - ] + parameters: { + storageConfigNames: [ + containerPathName + ] + } } ], submitFrom: 'submit-job-v2' From e5b9665a2c365a5acf3c71983c3e833806e5d310 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Fri, 5 Jun 2020 16:09:51 +0800 Subject: [PATCH 15/98] rename forward training service to router training service for better understanding. 
--- src/nni_manager/main.ts | 4 ++-- .../{forwardTrainingService.ts => routerTrainingService.ts} | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename src/nni_manager/training_service/reusable/{forwardTrainingService.ts => routerTrainingService.ts} (98%) diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index d626685a18..a999c04d5f 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -21,7 +21,7 @@ import { NNIRestServer } from './rest_server/nniRestServer'; import { FrameworkControllerTrainingService } from './training_service/kubernetes/frameworkcontroller/frameworkcontrollerTrainingService'; import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/kubeflowTrainingService'; import { LocalTrainingService } from './training_service/local/localTrainingService'; -import { ForwardTrainingService } from './training_service/reusable/forwardTrainingService'; +import { RouterTrainingService } from './training_service/reusable/routerTrainingService'; import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService'; import { RemoteMachineTrainingService @@ -47,7 +47,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN .scope(Scope.Singleton); } else if (platformMode === 'pai') { Container.bind(TrainingService) - .to(ForwardTrainingService) + .to(RouterTrainingService) .scope(Scope.Singleton); } else if (platformMode === 'paiYarn') { Container.bind(TrainingService) diff --git a/src/nni_manager/training_service/reusable/forwardTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts similarity index 98% rename from src/nni_manager/training_service/reusable/forwardTrainingService.ts rename to src/nni_manager/training_service/reusable/routerTrainingService.ts index b6944dadef..cec0918a4d 100644 --- a/src/nni_manager/training_service/reusable/forwardTrainingService.ts +++ 
b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -39,7 +39,7 @@ import { MountedStorageService } from './mountedStorageService'; * The final goal is to support reusable training job in higher level than training service. */ @component.Singleton -class ForwardTrainingService implements TrainingService { +class RouterTrainingService implements TrainingService { protected readonly log!: Logger; private internalTrainingService: TrainingService | undefined; private metaDataCache: Map = new Map(); @@ -170,4 +170,4 @@ class ForwardTrainingService implements TrainingService { } } -export { ForwardTrainingService }; +export { RouterTrainingService }; From 1e626fdb510b1d6b959c3132eef02d458cd1c3b1 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 9 Jun 2020 09:48:31 +0800 Subject: [PATCH 16/98] add trialService trialService is used to support different submission types like AML. --- .../training_service/reusable/environment.ts | 27 ---- .../reusable/environmentManager.ts | 132 ++++++------------ .../reusable/routerTrainingService.ts | 6 + .../reusable/storageTrialService.ts | 123 ++++++++++++++++ .../training_service/reusable/trial.ts | 69 +++++++++ 5 files changed, 240 insertions(+), 117 deletions(-) create mode 100644 src/nni_manager/training_service/reusable/storageTrialService.ts create mode 100644 src/nni_manager/training_service/reusable/trial.ts diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 9f8d601999..42d46058eb 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -19,7 +19,6 @@ 'use strict'; -import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../common/trainingService"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; @@ -30,32 +29,6 @@ export abstract class EnvironmentService { public 
abstract stopEnvironment(environment: EnvironmentInformation): Promise; } -export class TrialDetail implements TrialJobDetail { - public id: string; - public status: TrialJobStatus; - public submitTime: number; - public startTime?: number; - public endTime?: number; - public tags?: string[]; - public url?: string; - public workingDirectory: string; - public form: TrialJobApplicationForm; - public isEarlyStopped?: boolean; - public environment?: EnvironmentInformation; - - public readonly TRIAL_METADATA_DIR = ".nni"; - - constructor(id: string, status: TrialJobStatus, submitTime: number, - workingDirectory: string, form: TrialJobApplicationForm) { - this.id = id; - this.status = status; - this.submitTime = submitTime; - this.workingDirectory = workingDirectory; - this.form = form; - this.tags = []; - } -} - export class RunnerSettings { public experimentId: string = ""; public platform: string = ""; diff --git a/src/nni_manager/training_service/reusable/environmentManager.ts b/src/nni_manager/training_service/reusable/environmentManager.ts index c8038a7753..f96a6c14ad 100644 --- a/src/nni_manager/training_service/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/reusable/environmentManager.ts @@ -24,17 +24,16 @@ import * as path from 'path'; import * as component from '../../common/component'; import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; -import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric } from '../../common/trainingService'; +import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; import { delay, generateParamFileName, getVersion, uniqueString } from '../../common/utils'; -import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../core/commands'; -import { encodeCommand } from '../../core/ipcInterface'; import { 
CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { validateCodeDir } from '../common/util'; -import { EnvironmentInformation, EnvironmentService, RunnerSettings, TrialDetail } from './environment'; +import { EnvironmentInformation, EnvironmentService, RunnerSettings } from './environment'; import { JobRestServer } from './jobRestServer'; import { StorageService } from './storageService'; +import { TrialDetail, TrialService } from './trial'; /** * It uses to manage jobs on training platforms @@ -125,7 +124,8 @@ class EnvironmentManager implements TrainingService { { const environment = trial.environment; if (environment) { - await this.sendCommand(KILL_TRIAL_JOB, trialJobId, environment); + const trialService = component.get(TrialService); + await trialService.stopTrial(trial); trial.isEarlyStopped = isEarlyStopped; trial.status = trial.isEarlyStopped === true ? 
'EARLY_STOPPED' : 'USER_CANCELED'; @@ -230,33 +230,6 @@ class EnvironmentManager implements TrainingService { } } - private async sendCommand(commantType: string, data: any, environment: EnvironmentInformation): Promise { - let retryCount = 10; - let fileName: string; - let filePath: string = ""; - let findingName: boolean = true; - const command = encodeCommand(commantType, JSON.stringify(data)); - const storageService = component.get(StorageService); - const commandPath = storageService.joinPath(environment.workingFolder, `commands`); - - while (findingName) { - fileName = `manager_command_${new Date().getTime()}.txt`; - filePath = storageService.joinPath(commandPath, fileName); - if (!await storageService.exists(filePath)) { - findingName = false; - break; - } - if (retryCount == 0) { - throw new Error(`EnvironmentManager retry too many times to send command!`); - } - retryCount--; - await delay(1); - } - - // prevent to have imcomplete command, so save as temp name and then rename. - await storageService.save(command.toString("utf8"), filePath); - } - private async environmentMaintenanceLoop(): Promise { const environmentService = component.get(EnvironmentService); while (!this.stopping) { @@ -289,12 +262,26 @@ class EnvironmentManager implements TrainingService { } private async trialManagementLoop(): Promise { - const storageService = component.get(StorageService); while (!this.stopping) { + await delay(2000); + + const toRefreshedTrials: TrialDetail[] = []; + for (const trial of this.trials.values()) { + if (trial.status === "RUNNING" || trial.status === "WAITING" || trial.status === "UNKNOWN") { + toRefreshedTrials.push(trial); + } + } + + if (toRefreshedTrials.length == 0) { + continue; + } + + const trialService = component.get(TrialService); + trialService.updateTrialsStatus(toRefreshedTrials); + const waitingTrials: TrialDetail[] = []; let liveTrialsCount = 0; - const trials = this.trials.values(); - for (const trial of trials) { + for (const trial of 
toRefreshedTrials) { const currentStatus = trial.status; switch (currentStatus) { case "RUNNING": @@ -308,61 +295,26 @@ class EnvironmentManager implements TrainingService { continue; } - let isCompleted = false; - let remoteFiles: string[] = []; - - const codeFilePath = storageService.joinPath(trial.workingDirectory, trial.TRIAL_METADATA_DIR); - remoteFiles = await storageService.listDirectory(codeFilePath); - - if (remoteFiles.length > 0) { - let latestTimestamp = 0; - let aggregatedCode = 0; - let completedCount = 0; - for (const fileName of remoteFiles) { - if (fileName.startsWith("code")) { - const fullName = storageService.joinPath(codeFilePath, fileName) - const fileContent = await storageService.readFileContent(fullName); - - const match: RegExpMatchArray | null = fileContent.trim().match(/^-?(\d+)\s+(\d+)$/); - if (match !== null) { - const { 1: code, 2: timestamp } = match; - const intCode = parseInt(code, 10) - latestTimestamp = Math.max(latestTimestamp, parseInt(timestamp, 10)); - if (intCode !== 0) { - // only save latest non-zero exit code - aggregatedCode = intCode; - } - completedCount++; - } - } + // any node exit, then make sure the whole trial stopped. + if (trial.nodeExitResults.length > 0) { + const completedCount = trial.nodeExitResults.length; + let finalStatus: TrialJobStatus = "SUCCEEDED"; + this.log.debug(`found ${completedCount} completed trial process(es), nodeCount: ${environment.nodeCount}`); + + // if some trial processes doesn't exit, kill it for next one. + // for example, in horovod, it's just sleep command, has no impact on trial result. + if (environment.nodeCount >= completedCount) { + const trialService = component.get(TrialService); + await trialService.stopTrial(trial); } - - // for multiple running nodes, if one node is done, thought it's done. - // any failed node means failed. - // use <= for some wired cases. 
- if (completedCount > 0) { - this.log.debug(`found ${completedCount} completed trial process(es), nodeCount: ${environment.nodeCount}`); - - // if some trial processes doesn't exit, kill it for next one. - // for example, in horovod, it's just sleep command, has no impact on trial result. - if (environment.nodeCount >= completedCount) { - this.sendCommand(KILL_TRIAL_JOB, trial.id, environment); - } - if (trial.status == currentStatus) { - // Update trial job's status based on result code - if (aggregatedCode === 0) { - trial.status = 'SUCCEEDED'; - } else { - trial.status = 'FAILED'; - } + for (const nodeStatus of trial.nodeExitResults) { + if (nodeStatus == "FAILED") { + finalStatus = "FAILED"; } - trial.endTime = latestTimestamp; - this.releaseEnvironment(trial); - isCompleted = true; } - } - - if (isCompleted === false) { + trial.status = finalStatus; + this.releaseEnvironment(trial); + } else { // check status consistence with environment. const environmentStatus = environment.status; if (environmentStatus !== "RUNNING") { @@ -409,7 +361,6 @@ class EnvironmentManager implements TrainingService { await this.requestEnvironment(); } } - await delay(2000); } } @@ -456,14 +407,15 @@ class EnvironmentManager implements TrainingService { environment.isIdle = false; trial.environment = environment; - const settings = { + trial.settings = { trialId: trial.id, sequenceId: trial.form.sequenceId, parameter: trial.form.hyperParameters, } trial.startTime = Date.now(); trial.status = "RUNNING"; - await this.sendCommand(NEW_TRIAL_JOB, settings, environment); + const trialService = component.get(TrialService); + await trialService.startTrial(trial); } private releaseEnvironment(trial: TrialDetail): void { diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index cec0918a4d..223d21a182 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ 
b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -32,6 +32,8 @@ import { EnvironmentService } from './environment'; import { OpenPaiEnvironmentService } from './openPaiEnvironmentService'; import { StorageService } from './storageService'; import { MountedStorageService } from './mountedStorageService'; +import { TrialService } from './trial'; +import { StorageTrialService } from './storageTrialService'; /** @@ -120,6 +122,10 @@ class RouterTrainingService implements TrainingService { Container.bind(StorageService) .to(MountedStorageService) .scope(Scope.Singleton); + // TODO to support other trialService later. + Container.bind(TrialService) + .to(StorageTrialService) + .scope(Scope.Singleton); } else { this.log.debug(`caching metadata key:{} value:{}, as training service is not determined.`); this.internalTrainingService = component.get(PAIK8STrainingService); diff --git a/src/nni_manager/training_service/reusable/storageTrialService.ts b/src/nni_manager/training_service/reusable/storageTrialService.ts new file mode 100644 index 0000000000..474a1fa3db --- /dev/null +++ b/src/nni_manager/training_service/reusable/storageTrialService.ts @@ -0,0 +1,123 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + +'use strict'; + +import * as component from "../../common/component"; +import { delay } from "../../common/utils"; +import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../core/commands'; +import { encodeCommand } from "../../core/ipcInterface"; +import { EnvironmentInformation } from "./environment"; +import { StorageService } from "./storageService"; +import { TrialDetail, TrialService } from "./trial"; + +@component.Singleton +export class StorageTrialService extends TrialService { + public async config(_key: string, _value: string): Promise { + return; + } + + public async updateTrialsStatus(trials: TrialDetail[]): Promise { + const storageService = component.get(StorageService); + + for (const trial of trials) { + const currentStatus = trial.status; + // to prevent inconsistent status, skip all non running trials + if (currentStatus !== "RUNNING") { + continue; + } + + const environment = trial.environment; + if (environment === undefined) { + this.log.error(`found running trial ${trial.id} has no environment, set trial to UNKNOWN.`); + trial.status = "UNKNOWN"; + continue; + } + + let remoteFiles: string[] = []; + const codeFilePath = storageService.joinPath(trial.workingDirectory, trial.TRIAL_METADATA_DIR); + remoteFiles = await storageService.listDirectory(codeFilePath); + + if (remoteFiles.length > 0) { + let latestTimestamp = 0; + + trial.nodeExitResults = []; + for (const fileName of remoteFiles) { + if (fileName.startsWith("code")) { + const fullName = 
storageService.joinPath(codeFilePath, fileName) + const fileContent = await storageService.readFileContent(fullName); + + const match: RegExpMatchArray | null = fileContent.trim().match(/^-?(\d+)\s+(\d+)$/); + if (match !== null) { + const { 1: code, 2: timestamp } = match; + const intCode = parseInt(code, 10) + latestTimestamp = Math.max(latestTimestamp, parseInt(timestamp, 10)); + if (intCode === 0) { + trial.nodeExitResults.push("SUCCEEDED"); + } else { + trial.nodeExitResults.push("FAILED"); + } + } + } + } + } + } + } + + public async startTrial(trial: TrialDetail): Promise { + if (trial.environment === undefined) { + throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); + } + await this.sendCommand(NEW_TRIAL_JOB, trial.settings, trial.environment); + } + + public async stopTrial(trial: TrialDetail): Promise { + if (trial.environment === undefined) { + throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); + } + await this.sendCommand(KILL_TRIAL_JOB, trial.id, trial.environment); + } + + private async sendCommand(commantType: string, data: any, environment: EnvironmentInformation): Promise { + let retryCount = 10; + let fileName: string; + let filePath: string = ""; + let findingName: boolean = true; + const command = encodeCommand(commantType, JSON.stringify(data)); + const storageService = component.get(StorageService); + const commandPath = storageService.joinPath(environment.workingFolder, `commands`); + + while (findingName) { + fileName = `manager_command_${new Date().getTime()}.txt`; + filePath = storageService.joinPath(commandPath, fileName); + if (!await storageService.exists(filePath)) { + findingName = false; + break; + } + if (retryCount == 0) { + throw new Error(`EnvironmentManager retry too many times to send command!`); + } + retryCount--; + await delay(1); + } + + // prevent to have imcomplete command, so save as temp name and then rename. 
+ await storageService.save(command.toString("utf8"), filePath); + } +} diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts new file mode 100644 index 0000000000..4f471f41c9 --- /dev/null +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -0,0 +1,69 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import { Logger, getLogger } from "../../common/log"; +import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../common/trainingService"; +import { EnvironmentInformation } from "./environment"; + +export abstract class TrialService { + protected readonly log: Logger; + + public abstract config(key: string, value: string): Promise; + public abstract updateTrialsStatus(trials: TrialDetail[]): Promise; + public abstract startTrial(trial: TrialDetail): Promise; + public abstract stopTrial(trial: TrialDetail): Promise; + + constructor() { + this.log = getLogger(); + } +} + +export class TrialDetail implements TrialJobDetail { + public id: string; + public status: TrialJobStatus; + public submitTime: number; + public startTime?: number; + public endTime?: number; + public tags?: string[]; + public url?: string; + public workingDirectory: string; + public form: TrialJobApplicationForm; + public isEarlyStopped?: boolean; + public environment?: EnvironmentInformation; + + // init settings of trial + public settings = {}; + // it's used to aggregate node status for multiple node trial + public nodeExitResults: TrialJobStatus[]; + + public readonly TRIAL_METADATA_DIR = ".nni"; + + constructor(id: string, status: TrialJobStatus, submitTime: number, + workingDirectory: string, form: TrialJobApplicationForm) { + this.id = id; + this.status = status; + this.submitTime = submitTime; + this.workingDirectory = workingDirectory; + this.form = form; + this.tags = []; + this.nodeExitResults = []; + } +} From c6b606173a02e9f7c3ef2fda56cd9011396f159e Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 9 Jun 2020 11:57:02 +0800 Subject: [PATCH 17/98] not send stop for single node --- .../training_service/reusable/environmentManager.ts | 5 +++-- tools/nni_trial_tool/trial.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environmentManager.ts 
b/src/nni_manager/training_service/reusable/environmentManager.ts index f96a6c14ad..3af45d6d57 100644 --- a/src/nni_manager/training_service/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/reusable/environmentManager.ts @@ -299,11 +299,12 @@ class EnvironmentManager implements TrainingService { if (trial.nodeExitResults.length > 0) { const completedCount = trial.nodeExitResults.length; let finalStatus: TrialJobStatus = "SUCCEEDED"; - this.log.debug(`found ${completedCount} completed trial process(es), nodeCount: ${environment.nodeCount}`); + this.log.debug(`found ${completedCount} completed trial node(s), nodeCount: ${environment.nodeCount}`); // if some trial processes doesn't exit, kill it for next one. // for example, in horovod, it's just sleep command, has no impact on trial result. - if (environment.nodeCount >= completedCount) { + if (environment.nodeCount > completedCount) { + this.log.info(`stop partial completed trial ${trial.id}`); const trialService = component.get(TrialService); await trialService.stopTrial(trial); } diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index ff9584e850..1c9a11d0f4 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -99,7 +99,7 @@ def is_running(self): # In Windows, the retCode -1 is 4294967295. It's larger than c_long, and raise OverflowError. # So covert it to int32. retCode = ctypes.c_long(retCode).value - nni_log(LogType.Info, '{0}: subprocess terminated. Exit code is {1}. Quit'.format(self.name, retCode)) + nni_log(LogType.Info, '{0}: subprocess terminated. 
Exit code is {1}.'.format(self.name, retCode)) # Exit as the retCode of subprocess(trial) exit_code_file_name = os.path.join(self.trial_output_dir, "code") From b8e47beee0be60d485afd12acff32eaa5af9e981 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 16 Jun 2020 09:28:24 +0800 Subject: [PATCH 18/98] rename environmentManager to trialDispatcher TrialDispatcher is easier to understand it's purpose. --- .../training_service/reusable/routerTrainingService.ts | 4 ++-- .../reusable/{environmentManager.ts => trialDispatcher.ts} | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) rename src/nni_manager/training_service/reusable/{environmentManager.ts => trialDispatcher.ts} (99%) diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 223d21a182..8ce766224d 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -26,7 +26,7 @@ import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; import { PAIK8STrainingService } from '../pai/paiK8S/paiK8STrainingService'; -import { EnvironmentManager } from './environmentManager'; +import { TrialDispatcher } from './trialDispatcher'; import { Container, Scope } from 'typescript-ioc'; import { EnvironmentService } from './environment'; import { OpenPaiEnvironmentService } from './openPaiEnvironmentService'; @@ -112,7 +112,7 @@ class RouterTrainingService implements TrainingService { const config = JSON.parse(value); if (config.reuse === true) { this.log.info(`reuse flag enabled, use EnvironmentManager.`); - this.internalTrainingService = component.get(EnvironmentManager); + this.internalTrainingService = component.get(TrialDispatcher); // TODO to support other serivces later. 
Container.bind(EnvironmentService) diff --git a/src/nni_manager/training_service/reusable/environmentManager.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts similarity index 99% rename from src/nni_manager/training_service/reusable/environmentManager.ts rename to src/nni_manager/training_service/reusable/trialDispatcher.ts index 3af45d6d57..96c2a3e400 100644 --- a/src/nni_manager/training_service/reusable/environmentManager.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -40,7 +40,7 @@ import { TrialDetail, TrialService } from './trial'; * and expose trial as trial job to upper level. **/ @component.Singleton -class EnvironmentManager implements TrainingService { +class TrialDispatcher implements TrainingService { private readonly log: Logger; private stopping: boolean = false; @@ -431,4 +431,4 @@ class EnvironmentManager implements TrainingService { } } -export { EnvironmentManager }; +export { TrialDispatcher }; From 0ee933aa2eb71d5464b08c97fd0bfd3a5c21d357 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 16 Jun 2020 09:54:45 +0800 Subject: [PATCH 19/98] support no central storage service --- .../training_service/reusable/environment.ts | 5 +- .../reusable/openPaiEnvironmentService.ts | 81 ++++++++++--------- .../reusable/storageTrialService.ts | 13 ++- .../training_service/reusable/trial.ts | 3 +- .../reusable/trialDispatcher.ts | 56 ++++++------- 5 files changed, 89 insertions(+), 69 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 42d46058eb..59b38c48f4 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -23,8 +23,11 @@ export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; export abstract class EnvironmentService { + + public abstract get hasStorageService(): boolean; + public 
abstract config(key: string, value: string): Promise; - public abstract updateEnvironmentsStatus(environment: EnvironmentInformation[]): Promise; + public abstract refreshEnvironmentsStatus(environment: EnvironmentInformation[]): Promise; public abstract startEnvironment(environment: EnvironmentInformation): Promise; public abstract stopEnvironment(environment: EnvironmentInformation): Promise; } diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index 7414d9d83d..d28fafcc98 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -55,7 +55,49 @@ export class OpenPaiEnvironmentService implements EnvironmentService { this.experimentId = getExperimentId(); } - public async updateEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { + + public get hasStorageService(): boolean { + return true; + } + + public async config(key: string, value: string): Promise { + switch (key) { + case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: + this.paiClusterConfig = JSON.parse(value); + this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); + if (this.paiClusterConfig.passWord) { + // Get PAI authentication token + await this.updatePaiToken(); + } else if (this.paiClusterConfig.token) { + this.paiToken = this.paiClusterConfig.token; + } + break; + + case TrialConfigMetadataKey.TRIAL_CONFIG: { + if (this.paiClusterConfig === undefined) { + this.log.error('pai cluster config is not initialized'); + break; + } + this.paiTrialConfig = JSON.parse(value); + // Validate to make sure codeDir doesn't have too many files + + const storageService = component.get(StorageService); + const remoteRoot = storageService.joinPath(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId); + 
storageService.initialize(this.paiTrialConfig.nniManagerNFSMountPath, remoteRoot); + + if (this.paiTrialConfig.paiConfigPath) { + this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); + } + break; + } + case TrialConfigMetadataKey.MULTI_PHASE: + break; + default: + this.log.debug(`OpenPAI not proccessed metadata key: '${key}', value: '${value}'`); + } + } + + public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { const deferred: Deferred = new Deferred(); await this.refreshPlatform(); @@ -221,43 +263,6 @@ export class OpenPaiEnvironmentService implements EnvironmentService { return deferred.promise; } - public async config(key: string, value: string): Promise { - switch (key) { - case TrialConfigMetadataKey.PAI_CLUSTER_CONFIG: - this.paiClusterConfig = JSON.parse(value); - this.paiClusterConfig.host = this.formatPAIHost(this.paiClusterConfig.host); - if (this.paiClusterConfig.passWord) { - // Get PAI authentication token - await this.updatePaiToken(); - } else if (this.paiClusterConfig.token) { - this.paiToken = this.paiClusterConfig.token; - } - break; - - case TrialConfigMetadataKey.TRIAL_CONFIG: { - if (this.paiClusterConfig === undefined) { - this.log.error('pai cluster config is not initialized'); - break; - } - this.paiTrialConfig = JSON.parse(value); - // Validate to make sure codeDir doesn't have too many files - - const storageService = component.get(StorageService); - const remoteRoot = storageService.joinPath(this.paiTrialConfig.nniManagerNFSMountPath, this.experimentId); - storageService.initialize(this.paiTrialConfig.nniManagerNFSMountPath, remoteRoot); - - if (this.paiTrialConfig.paiConfigPath) { - this.paiJobConfig = yaml.safeLoad(fs.readFileSync(this.paiTrialConfig.paiConfigPath, 'utf8')); - } - break; - } - case TrialConfigMetadataKey.MULTI_PHASE: - break; - default: - this.log.debug(`OpenPAI not proccessed metadata key: '${key}', value: '${value}'`); - } - } - private 
async refreshPlatform(): Promise { if (this.paiClusterConfig && this.paiClusterConfig.passWord) { try { diff --git a/src/nni_manager/training_service/reusable/storageTrialService.ts b/src/nni_manager/training_service/reusable/storageTrialService.ts index 474a1fa3db..25e4651601 100644 --- a/src/nni_manager/training_service/reusable/storageTrialService.ts +++ b/src/nni_manager/training_service/reusable/storageTrialService.ts @@ -20,12 +20,13 @@ 'use strict'; import * as component from "../../common/component"; -import { delay } from "../../common/utils"; +import { delay, generateParamFileName } from "../../common/utils"; import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../core/commands'; import { encodeCommand } from "../../core/ipcInterface"; import { EnvironmentInformation } from "./environment"; import { StorageService } from "./storageService"; import { TrialDetail, TrialService } from "./trial"; +import { TrialJobApplicationForm } from "../../common/trainingService"; @component.Singleton export class StorageTrialService extends TrialService { @@ -33,7 +34,7 @@ export class StorageTrialService extends TrialService { return; } - public async updateTrialsStatus(trials: TrialDetail[]): Promise { + public async refreshTrialsStatus(trials: TrialDetail[]): Promise { const storageService = component.get(StorageService); for (const trial of trials) { @@ -94,6 +95,14 @@ export class StorageTrialService extends TrialService { await this.sendCommand(KILL_TRIAL_JOB, trial.id, trial.environment); } + public async updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise { + const storageService = component.get(StorageService); + const fileName = storageService.joinPath(trial.workingDirectory, generateParamFileName(form.hyperParameters)) + + // Write file content ( parameter.cfg ) to working folders + await storageService.save(form.hyperParameters.value, fileName); + } + private async sendCommand(commantType: string, data: any, environment: 
EnvironmentInformation): Promise { let retryCount = 10; let fileName: string; diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts index 4f471f41c9..2ada2cd958 100644 --- a/src/nni_manager/training_service/reusable/trial.ts +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -27,7 +27,8 @@ export abstract class TrialService { protected readonly log: Logger; public abstract config(key: string, value: string): Promise; - public abstract updateTrialsStatus(trials: TrialDetail[]): Promise; + public abstract refreshTrialsStatus(trials: TrialDetail[]): Promise; + public abstract updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise; public abstract startTrial(trial: TrialDetail): Promise; public abstract stopTrial(trial: TrialDetail): Promise; diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 96c2a3e400..e4784fa0bc 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -25,7 +25,7 @@ import * as component from '../../common/component'; import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; -import { delay, generateParamFileName, getVersion, uniqueString } from '../../common/utils'; +import { delay, getVersion, uniqueString } from '../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; @@ -107,10 +107,8 @@ class TrialDispatcher implements TrainingService { public async updateTrialJob(trialJobId: string, form: 
TrialJobApplicationForm): Promise { const trialDetail = await this.getTrialJob(trialJobId); - const storageService = component.get(StorageService); - const fileName = storageService.joinPath(trialDetail.workingDirectory, generateParamFileName(form.hyperParameters)) - // Write file content ( parameter.cfg ) to working folders - await storageService.save(form.hyperParameters.value, fileName); + const trialService = component.get(TrialService); + await trialService.updateTrial(trialDetail, form); return trialDetail; } @@ -147,19 +145,22 @@ class TrialDispatcher implements TrainingService { throw new Error(`trial config shouldn't be undefined in run()`); } - this.log.info(`Environment Manager copying code and settings.`); - const storageService = component.get(StorageService); - // Copy the compressed file to remoteDirectory and delete it - const codeDir = path.resolve(this.trialConfig.codeDir); - const envDir = storageService.joinPath("envs"); - const codeFileName = await storageService.copyDirectory(codeDir, envDir, true); - storageService.rename(codeFileName, "nni-code.tar.gz"); - - const installFileName = storageService.joinPath(envDir, 'install_nni.sh'); - await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName); - - const runnerSettings = storageService.joinPath(envDir, "settings.json"); - await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); + const environmentService = component.get(EnvironmentService); + if (environmentService.hasStorageService) { + this.log.info(`Environment Manager copying code and settings.`); + const storageService = component.get(StorageService); + // Copy the compressed file to remoteDirectory and delete it + const codeDir = path.resolve(this.trialConfig.codeDir); + const envDir = storageService.joinPath("envs"); + const codeFileName = await storageService.copyDirectory(codeDir, envDir, true); + storageService.rename(codeFileName, "nni-code.tar.gz"); + + const installFileName = 
storageService.joinPath(envDir, 'install_nni.sh'); + await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName); + + const runnerSettings = storageService.joinPath(envDir, "settings.json"); + await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); + } this.log.info(`Environment Manager run loop started.`); await Promise.all([ @@ -239,7 +240,7 @@ class TrialDispatcher implements TrainingService { environments.push(environment); } }); - await environmentService.updateEnvironmentsStatus(environments); + await environmentService.refreshEnvironmentsStatus(environments); environments.forEach((environment) => { const oldIsAlive = environment.isAlive; @@ -277,7 +278,7 @@ class TrialDispatcher implements TrainingService { } const trialService = component.get(TrialService); - trialService.updateTrialsStatus(toRefreshedTrials); + trialService.refreshTrialsStatus(toRefreshedTrials); const waitingTrials: TrialDetail[] = []; let liveTrialsCount = 0; @@ -367,20 +368,21 @@ class TrialDispatcher implements TrainingService { private async requestEnvironment(): Promise { const environmentService = component.get(EnvironmentService); - const storageService = component.get(StorageService); const envId = uniqueString(5); const name = `nni_exp_${this.experimentId}_env_${envId}`; const environment = new EnvironmentInformation(envId, name); - environment.workingFolder = storageService.joinPath("envs", envId); environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; - await storageService.createDirectory(environment.workingFolder); + if (environmentService.hasStorageService) { + const storageService = component.get(StorageService); + environment.workingFolder = storageService.joinPath("envs", envId); + await storageService.createDirectory(environment.workingFolder); - const isDebuging = true; - if (isDebuging) { - // environment.status = "RUNNING"; - await storageService.copyDirectory("../nni/tools/nni_trial_tool", 
environment.workingFolder); + const isDebuging = true; + if (isDebuging) { + await storageService.copyDirectory("../nni/tools/nni_trial_tool", environment.workingFolder); + } } this.environments.set(environment.id, environment); From c7973bebfdb48c7cb672d16f8a125e08fbb48d1d Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 16 Jun 2020 10:43:05 +0800 Subject: [PATCH 20/98] init --- src/nni_manager/package.json | 1 + .../rest_server/restValidationSchemas.ts | 8 + .../training_service/aml/amlConfig.ts | 66 +++++ .../training_service/aml/amlJobRestServer.ts | 72 +++++ .../aml/amlTrainingService.ts | 270 ++++++++++++++++++ .../training_service/aml/jobTemplate.py | 36 +++ .../common/trialConfigMetadataKey.ts | 1 + tools/nni_cmd/config_schema.py | 20 +- tools/nni_cmd/launcher.py | 21 ++ 9 files changed, 494 insertions(+), 1 deletion(-) create mode 100644 src/nni_manager/training_service/aml/amlConfig.ts create mode 100644 src/nni_manager/training_service/aml/amlJobRestServer.ts create mode 100644 src/nni_manager/training_service/aml/amlTrainingService.ts create mode 100644 src/nni_manager/training_service/aml/jobTemplate.py diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index 34aa0b0121..8a8d69b958 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -18,6 +18,7 @@ "express-joi-validator": "^2.0.0", "js-base64": "^2.4.9", "kubernetes-client": "^6.5.0", + "python-shell": "^2.0.1", "rx": "^4.1.0", "sqlite3": "^4.0.2", "ssh2": "^0.6.1", diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 773066bfd4..a25ee8efca 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -39,6 +39,9 @@ export namespace ValidationSchemas { nniManagerNFSMountPath: joi.string().min(1), containerNFSMountPath: joi.string().min(1), paiConfigPath: joi.string(), + script: joi.string(), + computeTarget: 
joi.string(), + nodeCount: joi.number(), paiStorageConfigName: joi.string().min(1), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), portList: joi.array().items(joi.object({ @@ -149,6 +152,11 @@ export namespace ValidationSchemas { email: joi.string().min(1), password: joi.string().min(1) }), + aml_config: joi.object({ + subscriptionId: joi.string().min(1), + resourceGroup: joi.string().min(1), + workspaceName: joi.string().min(1) + }), nni_manager_ip: joi.object({ // eslint-disable-line @typescript-eslint/camelcase nniManagerIp: joi.string().min(1) }) diff --git a/src/nni_manager/training_service/aml/amlConfig.ts b/src/nni_manager/training_service/aml/amlConfig.ts new file mode 100644 index 0000000000..15950044c6 --- /dev/null +++ b/src/nni_manager/training_service/aml/amlConfig.ts @@ -0,0 +1,66 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; +import {TrialConfig} from '../common/trialConfig'; + +export class AMLClusterConfig { + public readonly subscriptionId: string; + public readonly resourceGroup: string; + public readonly workspaceName: string; + + constructor(subscriptionId: string, resourceGroup: string, workspaceName: string) { + this.subscriptionId = subscriptionId; + this.resourceGroup = resourceGroup; + this.workspaceName = workspaceName; + } +} + +export class AMLTrialConfig extends TrialConfig { + public readonly image: string; + public readonly script: string; + public readonly codeDir: string; + public readonly nodeCount: number; + public readonly computerTarget: string; + + constructor(codeDir: string, script: string, image: string, nodeCount: number, computerTarget: string) { + super("", codeDir, 0); + this.codeDir = codeDir; + this.script = script; + this.image = image; + this.nodeCount = nodeCount; + this.computerTarget = computerTarget; + } +} + +/** + 
* AML trial job detail + */ +export class AMLTrialJobDetail implements TrialJobDetail { + public id: string; + public status: TrialJobStatus; + public amlJobName: string; + public submitTime: number; + public startTime?: number; + public endTime?: number; + public tags?: string[]; + public url?: string; + public workingDirectory: string; + public form: TrialJobApplicationForm; + public logPath: string; + public isEarlyStopped?: boolean; + + constructor(id: string, status: TrialJobStatus, amlJobName: string, + submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) { + this.id = id; + this.status = status; + this.amlJobName = amlJobName; + this.submitTime = submitTime; + this.workingDirectory = workingDirectory; + this.form = form; + this.tags = []; + this.logPath = logPath; + } +} diff --git a/src/nni_manager/training_service/aml/amlJobRestServer.ts b/src/nni_manager/training_service/aml/amlJobRestServer.ts new file mode 100644 index 0000000000..0c8082a976 --- /dev/null +++ b/src/nni_manager/training_service/aml/amlJobRestServer.ts @@ -0,0 +1,72 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. + +'use strict'; + +import { Request, Response, Router } from 'express'; +import { ClusterJobRestServer } from '../common/clusterJobRestServer'; +import { AMLTrainingService } from './amlTrainingService'; + +export interface ParameterFileMeta { + readonly experimentId: string; + readonly trialId: string; + readonly filePath: string; +} + +/** + * PAI Training service Rest server, provides rest API to support pai job metrics update + * + */ +export class PAIJobRestServer extends ClusterJobRestServer { + protected parameterFileMetaList: ParameterFileMeta[] = []; + + protected readonly paiTrainingService: PAITrainingService; + + /** + * constructor to provide NNIRestServer's own rest property, e.g. 
port + */ + constructor (paiTrainingService: PAITrainingService) { + super(); + this.paiTrainingService = paiTrainingService; + } + + protected handleTrialMetrics(jobId: string, metrics: any[]): void { + // Split metrics array into single metric, then emit + // Warning: If not split metrics into single ones, the behavior will be UNKNOWN + for (const singleMetric of metrics) { + this.paiTrainingService.MetricsEmitter.emit('metric', { + id : jobId, + data : singleMetric + }); + } + } + + protected createRestHandler(): Router { + const router: Router = super.createRestHandler(); + + router.post(`/parameter-file-meta`, (req: Request, res: Response) => { + try { + this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`); + this.parameterFileMetaList.push(req.body); + res.send(); + } catch (err) { + this.log.error(`POST parameter-file-meta error: ${err}`); + res.status(500); + res.send(err.message); + } + }); + + router.get(`/parameter-file-meta`, (req: Request, res: Response) => { + try { + this.log.info(`GET /parameter-file-meta`); + res.send(this.parameterFileMetaList); + } catch (err) { + this.log.error(`GET parameter-file-meta error: ${err}`); + res.status(500); + res.send(err.message); + } + }); + + return router; + } +} diff --git a/src/nni_manager/training_service/aml/amlTrainingService.ts b/src/nni_manager/training_service/aml/amlTrainingService.ts new file mode 100644 index 0000000000..94142d4460 --- /dev/null +++ b/src/nni_manager/training_service/aml/amlTrainingService.ts @@ -0,0 +1,270 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
+ +'use strict'; + +import * as path from 'path'; +import * as request from 'request'; +import * as component from '../../common/component'; + +import { EventEmitter } from 'events'; +import { Deferred } from 'ts-deferred'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../common/log'; +import { + NNIManagerIpConfig, TrainingService, + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric +} from '../../common/trainingService'; +import { delay } from '../../common/utils'; +import { AMLClusterConfig, AMLTrialJobDetail, AMLTrialConfig } from './amlConfig'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { execMkdir, validateCodeDir, execCopydir } from '../common/util'; +import { + generateParamFileName, + getIPV4Address, getVersion, uniqueString +} from '../../common/utils'; +import { PythonShell } from 'python-shell'; + +/** + * Training Service implementation for OpenPAI (Open Platform for AI) + * Refer https://github.com/Microsoft/pai for more info about OpenPAI + */ +@component.Singleton +abstract class AMLTrainingService implements TrainingService { + private readonly log!: Logger; + private readonly metricsEmitter: EventEmitter; + private readonly trialJobsMap: Map; + private readonly expRootDir: string; + private amlClusterConfig?: AMLClusterConfig; + private amlTrialConfig?: AMLTrialConfig; + private readonly jobQueue: string[]; + private stopping: boolean = false; + private readonly experimentId!: string; + private nniManagerIpConfig?: NNIManagerIpConfig; + private versionCheck: boolean = true; + private isMultiPhase: boolean = false; + private nniVersion?: string; + + constructor() { + this.log = getLogger(); + this.metricsEmitter = new EventEmitter(); + this.trialJobsMap = new Map(); + this.jobQueue = []; + this.expRootDir = path.join('/nni', 'experiments', getExperimentId()); + this.experimentId = getExperimentId(); + this.log.info('Construct aml training 
service.'); + } + + public async run(): Promise { + this.log.info('Run AML training service.'); + await Promise.all([ + this.statusCheckingLoop(), + this.submitJobLoop()]); + this.log.info('AML training service exit.'); + } + + public async setClusterMetadata(key: string, value: string): Promise { + switch (key) { + case TrialConfigMetadataKey.NNI_MANAGER_IP: + this.nniManagerIpConfig = JSON.parse(value); + break; + + case TrialConfigMetadataKey.AML_CLUSTER_CONFIG: + this.amlClusterConfig = JSON.parse(value); + break; + + case TrialConfigMetadataKey.TRIAL_CONFIG: { + if (this.amlClusterConfig === undefined) { + this.log.error('aml cluster config is not initialized'); + break; + } + this.amlTrialConfig = JSON.parse(value); + // Validate to make sure codeDir doesn't have too many files + await validateCodeDir(this.amlTrialConfig.codeDir); + break; + } + case TrialConfigMetadataKey.VERSION_CHECK: + this.versionCheck = (value === 'true' || value === 'True'); + this.nniVersion = this.versionCheck ? 
await getVersion() : ''; + break; + case TrialConfigMetadataKey.MULTI_PHASE: + this.isMultiPhase = (value === 'true' || value === 'True'); + break; + default: + //Reject for unknown keys + this.log.error(`Uknown key: ${key}`); + } + } + + private async submitJobLoop(): Promise { + while (!this.stopping) { + while (!this.stopping && this.jobQueue.length > 0) { + const trialJobId: string = this.jobQueue[0]; + if (await this.submitTrialJobToAML(trialJobId)) { + // Remove trial job with trialJobId from job queue + this.jobQueue.shift(); + } else { + // Break the while loop since failed to submitJob + break; + } + } + await delay(3000); + } + } + + // update trial parameters for multi-phase + public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { + const trialJobDetail: AMLTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + if (trialJobDetail === undefined) { + throw new Error(`updateTrialJob failed: ${trialJobId} not found`); + } + + return trialJobDetail; + } + + public async listTrialJobs(): Promise { + const jobs: TrialJobDetail[] = []; + + for (const key of this.trialJobsMap.keys()) { + jobs.push(await this.getTrialJob(key)); + } + + return jobs; + } + + public async submitTrialJob(form: TrialJobApplicationForm): Promise { + if (this.amlClusterConfig === undefined) { + throw new Error(`paiClusterConfig not initialized!`); + } + if (this.amlTrialConfig === undefined) { + throw new Error(`paiTrialConfig not initialized!`); + } + + this.log.info(`submitTrialJob: form: ${JSON.stringify(form)}`); + + const trialJobId: string = uniqueString(5); + //TODO: use HDFS working folder instead + const trialWorkingFolder: string = path.join(this.expRootDir, 'trials', trialJobId); + const amlJobName: string = `nni_exp_${this.experimentId}_trial_${trialJobId}`; + const logPath: string = ""; + const trialJobDetail: AMLTrialJobDetail = new AMLTrialJobDetail( + trialJobId, + 'WAITING', + amlJobName, + Date.now(), + trialWorkingFolder, + 
form, + logPath); + + this.trialJobsMap.set(trialJobId, trialJobDetail); + this.jobQueue.push(trialJobId); + + return trialJobDetail; + } + + public async getTrialJob(trialJobId: string): Promise { + if (this.amlClusterConfig === undefined) { + throw new Error('AML Cluster config is not initialized'); + } + + const amlTrialJob: AMLTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + + if (amlTrialJob === undefined) { + throw new Error(`trial job ${trialJobId} not found`); + } + + return amlTrialJob; + } + + public addTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { + this.metricsEmitter.on('metric', listener); + } + + public removeTrialJobMetricListener(listener: (metric: TrialJobMetric) => void): void { + this.metricsEmitter.off('metric', listener); + } + + public get isMultiPhaseJobSupported(): boolean { + return true; + } + + public cancelTrialJob(trialJobId: string, isEarlyStopped: boolean = false): Promise { + const trialJobDetail: AMLTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + if (trialJobDetail === undefined) { + return Promise.reject(new Error(`cancelTrialJob: trial job id ${trialJobId} not found`)); + } + + if (this.amlClusterConfig === undefined) { + return Promise.reject(new Error('PAI Cluster config is not initialized')); + } + + return Promise.resolve(); + } + + public getClusterMetadata(_key: string): Promise { + throw new Error('Not implemented!'); + } + + public async cleanUp(): Promise { + this.log.info('Stopping AML training service...'); + this.stopping = true; + } + + public get MetricsEmitter(): EventEmitter { + return this.metricsEmitter; + } + + private async statusCheckingLoop(): Promise { + while (!this.stopping) { + + await delay(3000); + } + } + + private async submitTrialJobToAML(trialJobId: string): Promise { + const deferred: Deferred = new Deferred(); + const trialJobDetail: AMLTrialJobDetail | undefined = this.trialJobsMap.get(trialJobId); + + if (trialJobDetail === 
undefined) { + throw new Error(`Failed to find PAITrialJobDetail for job ${trialJobId}`); + } + + if (this.amlClusterConfig === undefined) { + throw new Error('PAI Cluster config is not initialized'); + } + if (this.amlTrialConfig === undefined) { + throw new Error('trial config is not initialized'); + } + + //Generate Job Configuration in yaml format + const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail); + this.log.debug(paiJobConfig); + // Step 2. Submit PAI job via Rest call + // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API + const submitJobRequest: request.Options = { + uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, + method: 'POST', + body: paiJobConfig, + headers: { + 'Content-Type': 'text/yaml', + Authorization: `Bearer ${this.paiToken}` + } + }; + request(submitJobRequest, (error: Error, response: request.Response, body: any) => { + if ((error !== undefined && error !== null) || response.statusCode >= 400) { + const errorMessage: string = (error !== undefined && error !== null) ? 
error.message : + `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${body}`; + + this.log.error(errorMessage); + trialJobDetail.status = 'FAILED'; + } else { + trialJobDetail.submitTime = Date.now(); + } + deferred.resolve(true); + }); + + return deferred.promise; + } +} + +export { AMLTrainingService }; diff --git a/src/nni_manager/training_service/aml/jobTemplate.py b/src/nni_manager/training_service/aml/jobTemplate.py new file mode 100644 index 0000000000..905d30c03b --- /dev/null +++ b/src/nni_manager/training_service/aml/jobTemplate.py @@ -0,0 +1,36 @@ +import os +from argparse import ArgumentParser +from azureml.core import Experiment, RunConfiguration, ScriptRunConfig +from azureml.core.compute import ComputeTarget +from azureml.core.run import RUNNING_STATES, RunStatus, Run +from azureml.core import Workspace +from azureml.core.conda_dependencies import CondaDependencies + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument('--subscription_id', help='the subscription id of aml') + parser.add_argument('--resource_group', help='the resource group of aml') + parser.add_argument('--workspace_name', help='the workspace name of aml') + parser.add_argument('--computer_target', help='the computer cluster name of aml') + parser.add_argument('--docker_image', help='the docker image of job') + parser.add_argument('--experiment_name', help='the experiment name') + parser.add_argument('--code_dir', help='code directory') + parser.add_argument('--script', help='script') + args = parser.parse_args() + + ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) + compute_target = ComputeTarget(workspace=ws, name=args.computer_target) + experiment = Experiment(ws, args.experiment_name) + dependencies = CondaDependencies() + dependencies.add_pip_package("azureml-sdk") + dependencies.add_pip_package("azureml") + + run_config = RunConfiguration() + run_config.environment.python.conda_dependencies = 
dependencies + run_config.environment.docker.enabled = True + run_config.environment.docker.base_image = args.docker_image + run_config.target = compute_target + run_config.node_count = 1 + config = ScriptRunConfig(source_directory=args.code_dir, script=args.script, run_config=run_config) + script_run = experiment.submit(config) + print(script_run.get_portal_url()) diff --git a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts index deaeea4f59..bb5afc76af 100644 --- a/src/nni_manager/training_service/common/trialConfigMetadataKey.ts +++ b/src/nni_manager/training_service/common/trialConfigMetadataKey.ts @@ -19,6 +19,7 @@ export enum TrialConfigMetadataKey { NNI_MANAGER_IP = 'nni_manager_ip', FRAMEWORKCONTROLLER_CLUSTER_CONFIG = 'frameworkcontroller_config', DLTS_CLUSTER_CONFIG = 'dlts_config', + AML_CLUSTER_CONFIG = 'aml_config', VERSION_CHECK = 'version_check', LOG_COLLECTION = 'log_collection' } diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index 9aa516beef..382dd4fe7d 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -33,7 +33,7 @@ def setPathCheck(key): Optional('maxExecDuration'): And(Regex(r'^[1-9][0-9]*[s|m|h|d]$', error='ERROR: maxExecDuration format is [digit]{s,m,h,d}')), Optional('maxTrialNum'): setNumberRange('maxTrialNum', int, 1, 99999), 'trainingServicePlatform': setChoice( - 'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts'), + 'trainingServicePlatform', 'remote', 'local', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'), Optional('searchSpacePath'): And(os.path.exists, error=SCHEMA_PATH_ERROR % 'searchSpacePath'), Optional('multiPhase'): setType('multiPhase', bool), Optional('multiThread'): setType('multiThread', bool), @@ -330,6 +330,24 @@ def setPathCheck(key): } } +aml_trial_schema = { + 'trial':{ + 'codeDir': 
setPathCheck('codeDir'), + 'script': setType('script', str), + 'image': setType('image', str), + 'computeTarget': setType('computeClusterName', str), + 'nodeCount': setType('nodeCount', int) + } +} + +aml_config_schema = { + 'amlConfig': { + 'subscriptionId': setType('subscriptionId', str), + 'resourceGroup': setType('resourceGroup', str), + 'workspaceName': setType('workspaceName', str), + } +} + kubeflow_trial_schema = { 'trial':{ 'codeDir': setPathCheck('codeDir'), diff --git a/tools/nni_cmd/launcher.py b/tools/nni_cmd/launcher.py index 6ea6eb348f..57eb1c29b0 100644 --- a/tools/nni_cmd/launcher.py +++ b/tools/nni_cmd/launcher.py @@ -268,6 +268,25 @@ def set_dlts_config(experiment_config, port, config_file_name): #set trial_config return set_trial_config(experiment_config, port, config_file_name), err_message +def set_aml_config(experiment_config, port, config_file_name): + '''set aml configuration''' + aml_config_data = dict() + aml_config_data['aml_config'] = experiment_config['amlConfig'] + response = rest_put(cluster_metadata_url(port), json.dumps(aml_config_data), REST_TIME_OUT) + err_message = None + if not response or not response.status_code == 200: + if response is not None: + err_message = response.text + _, stderr_full_path = get_log_path(config_file_name) + with open(stderr_full_path, 'a+') as fout: + fout.write(json.dumps(json.loads(err_message), indent=4, sort_keys=True, separators=(',', ':'))) + return False, err_message + result, message = setNNIManagerIp(experiment_config, port, config_file_name) + if not result: + return result, message + #set trial_config + return set_trial_config(experiment_config, port, config_file_name), err_message + def set_experiment(experiment_config, mode, port, config_file_name): '''Call startExperiment (rest POST /experiment) with yaml file content''' request_data = dict() @@ -370,6 +389,8 @@ def set_platform_config(platform, experiment_config, port, config_file_name, res config_result, err_msg = 
set_frameworkcontroller_config(experiment_config, port, config_file_name) elif platform == 'dlts': config_result, err_msg = set_dlts_config(experiment_config, port, config_file_name) + elif platform == 'aml': + config_result, err_msg = set_aml_config(experiment_config, port, config_file_name) else: raise Exception(ERROR_INFO % 'Unsupported platform!') exit(1) From c09405701fb719c242ff754e8e4d5731765d607e Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 16 Jun 2020 12:56:08 +0800 Subject: [PATCH 21/98] improve delopment support --- .../reusable/trialDispatcher.ts | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index e4784fa0bc..e017eaaeb0 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -20,12 +20,13 @@ 'use strict'; import { EventEmitter } from 'events'; +import * as fs from 'fs'; import * as path from 'path'; import * as component from '../../common/component'; import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; -import { delay, getVersion, uniqueString } from '../../common/utils'; +import { delay, getLogLevel, getVersion, uniqueString } from '../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; @@ -43,6 +44,7 @@ import { TrialDetail, TrialService } from './trial'; class TrialDispatcher implements TrainingService { private readonly log: Logger; + private readonly isDeveloping: boolean = false; private stopping: 
boolean = false; private jobRestServer: JobRestServer; @@ -66,6 +68,14 @@ class TrialDispatcher implements TrainingService { this.runnerSettings = new RunnerSettings(); this.runnerSettings.experimentId = this.experimentId; this.runnerSettings.platform = getPlatform(); + + const logLevel = getLogLevel(); + + this.log.debug(`current folder ${__dirname}`); + if (logLevel == "debug" && fs.existsSync("../../../src/nni_manager")) { + this.log.debug("log level is debug, and exist code folder, so set to developing mode."); + this.isDeveloping = true; + } } public async listTrialJobs(): Promise { @@ -92,10 +102,14 @@ class TrialDispatcher implements TrainingService { throw new Error(`trialConfig not initialized!`); } - const storageService = component.get(StorageService); const trialId: string = uniqueString(5); - const trialWorkingFolder: string = storageService.joinPath('trials', trialId); + const environmentService = component.get(EnvironmentService); + let trialWorkingFolder: string = ""; + if (environmentService.hasStorageService) { + const storageService = component.get(StorageService); + trialWorkingFolder = storageService.joinPath('trials', trialId); + } const trialJobDetail: TrialDetail = new TrialDetail(trialId, "WAITING", Date.now(), trialWorkingFolder, form); this.trials.set(trialId, trialJobDetail); @@ -160,6 +174,10 @@ class TrialDispatcher implements TrainingService { const runnerSettings = storageService.joinPath(envDir, "settings.json"); await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); + + if (this.isDeveloping) { + await storageService.copyDirectory("../nni/tools/nni_trial_tool", envDir, true); + } } this.log.info(`Environment Manager run loop started.`); @@ -374,15 +392,14 @@ class TrialDispatcher implements TrainingService { environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; + if (this.isDeveloping) { + environment.command = "mkdir ./nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C 
./nni_trial_tool &&" + environment.command; + } + if (environmentService.hasStorageService) { const storageService = component.get(StorageService); environment.workingFolder = storageService.joinPath("envs", envId); await storageService.createDirectory(environment.workingFolder); - - const isDebuging = true; - if (isDebuging) { - await storageService.copyDirectory("../nni/tools/nni_trial_tool", environment.workingFolder); - } } this.environments.set(environment.id, environment); From d0b250413238032229802ba85fe182b0f5ab60a0 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 16 Jun 2020 13:07:37 +0800 Subject: [PATCH 22/98] use latest storage component --- .../pai/paiK8S/paiK8STrainingService.ts | 2 -- .../reusable/openPaiEnvironmentService.ts | 11 ++--------- 2 files changed, 2 insertions(+), 11 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index e9c61be856..8d8560e9d2 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -201,8 +201,6 @@ class PAIK8STrainingService extends PAITrainingService { } } else { - const containerPathParts = this.paiTrialConfig.containerNFSMountPath.split("/"); - const containerPathName = containerPathParts[containerPathParts.length - 1]; nniJobConfig = { protocolVersion: 2, name: jobName, diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts index d28fafcc98..174f77a717 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts @@ -313,8 +313,6 @@ export class OpenPaiEnvironmentService implements EnvironmentService { } } else { - const containerPathParts = this.paiTrialConfig.containerNFSMountPath.split("/"); - 
const containerPathName = containerPathParts[containerPathParts.length - 1]; nniJobConfig = { protocolVersion: 2, name: jobName, @@ -347,14 +345,9 @@ export class OpenPaiEnvironmentService implements EnvironmentService { } }, extras: { - 'com.microsoft.pai.runtimeplugin': [ + 'storages': [ { - plugin: this.paiTrialConfig.paiStoragePlugin, - parameters: { - storageConfigNames: [ - containerPathName - ] - } + name: this.paiTrialConfig.paiStorageConfigName } ], submitFrom: 'submit-job-v2' From c8d4696734032897b71421da0034f5b13c752879 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 16 Jun 2020 13:18:29 +0800 Subject: [PATCH 23/98] add gpu info --- src/nni_manager/training_service/reusable/environment.ts | 4 ++++ src/nni_manager/training_service/reusable/trial.ts | 3 +++ 2 files changed, 7 insertions(+) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 59b38c48f4..ed31313169 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -19,6 +19,8 @@ 'use strict'; +import { GPUSummary } from "training_service/common/gpuData"; + export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; @@ -63,6 +65,8 @@ export class EnvironmentInformation { public command: string = ""; public nodeCount: number = 1; + public gpuSummary: GPUSummary | undefined; + constructor(id: string, jobName: string, jobId?: string) { this.id = id; this.jobName = jobName; diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts index 2ada2cd958..4f4a70fe25 100644 --- a/src/nni_manager/training_service/reusable/trial.ts +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -22,6 +22,7 @@ import { Logger, getLogger } from "../../common/log"; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from 
"../../common/trainingService"; import { EnvironmentInformation } from "./environment"; +import { GPUInfo } from "training_service/common/gpuData"; export abstract class TrialService { protected readonly log: Logger; @@ -54,6 +55,8 @@ export class TrialDetail implements TrialJobDetail { public settings = {}; // it's used to aggregate node status for multiple node trial public nodeExitResults: TrialJobStatus[]; + // assigned GPUs for multi-trial scheduled. + public assignedGpus: GPUInfo[] = []; public readonly TRIAL_METADATA_DIR = ".nni"; From 1a9f19f5490fc1d1525a7c954b188b0d51d82719 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 16 Jun 2020 17:09:43 +0800 Subject: [PATCH 24/98] work version --- .../aml/jobSubmission.py} | 5 ++ src/nni_manager/main.ts | 9 ++- .../rest_server/restValidationSchemas.ts | 2 +- .../training_service/aml/amlJobRestServer.ts | 72 ------------------- .../aml/amlTrainingService.ts | 51 +++++++------ tools/nni_cmd/config_schema.py | 4 +- tools/nni_cmd/launcher_utils.py | 5 +- 7 files changed, 43 insertions(+), 105 deletions(-) rename src/nni_manager/{training_service/aml/jobTemplate.py => config/aml/jobSubmission.py} (93%) delete mode 100644 src/nni_manager/training_service/aml/amlJobRestServer.ts diff --git a/src/nni_manager/training_service/aml/jobTemplate.py b/src/nni_manager/config/aml/jobSubmission.py similarity index 93% rename from src/nni_manager/training_service/aml/jobTemplate.py rename to src/nni_manager/config/aml/jobSubmission.py index 905d30c03b..cd6fbaa6a9 100644 --- a/src/nni_manager/training_service/aml/jobTemplate.py +++ b/src/nni_manager/config/aml/jobSubmission.py @@ -1,4 +1,5 @@ import os +import time from argparse import ArgumentParser from azureml.core import Experiment, RunConfiguration, ScriptRunConfig from azureml.core.compute import ComputeTarget @@ -34,3 +35,7 @@ config = ScriptRunConfig(source_directory=args.code_dir, script=args.script, run_config=run_config) script_run = experiment.submit(config) 
print(script_run.get_portal_url()) + while True: + time.sleep(5) + print(script_run.get_status()) + print(script_run.get_metrics()) diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 66dd3fce0b..d45d9e7f44 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -23,6 +23,7 @@ import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/ import { LocalTrainingService } from './training_service/local/localTrainingService'; import { PAIK8STrainingService } from './training_service/pai/paiK8S/paiK8STrainingService'; import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService'; +import { AMLTrainingService } from './training_service/aml/amlTrainingService'; import { RemoteMachineTrainingService } from './training_service/remote_machine/remoteMachineTrainingService'; @@ -65,6 +66,10 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN Container.bind(TrainingService) .to(DLTSTrainingService) .scope(Scope.Singleton); + } else if (platformMode === 'aml') { + Container.bind(TrainingService) + .to(AMLTrainingService) + .scope(Scope.Singleton); } else { throw new Error(`Error: unsupported mode: ${platformMode}`); } @@ -93,7 +98,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN function usage(): void { console.info('usage: node main.js --port --mode \ - --start_mode --experiment_id --foreground '); + --start_mode --experiment_id --foreground '); } const strPort: string = parseArg(['--port', '-p']); @@ -113,7 +118,7 @@ const foreground: boolean = foregroundArg.toLowerCase() === 'true' ? 
true : fals const port: number = parseInt(strPort, 10); const mode: string = parseArg(['--mode', '-m']); -if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts'].includes(mode)) { +if (!['local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml'].includes(mode)) { console.log(`FATAL: unknown mode: ${mode}`); usage(); process.exit(1); diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index a25ee8efca..8299a71e16 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -40,7 +40,7 @@ export namespace ValidationSchemas { containerNFSMountPath: joi.string().min(1), paiConfigPath: joi.string(), script: joi.string(), - computeTarget: joi.string(), + computerTarget: joi.string(), nodeCount: joi.number(), paiStorageConfigName: joi.string().min(1), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), diff --git a/src/nni_manager/training_service/aml/amlJobRestServer.ts b/src/nni_manager/training_service/aml/amlJobRestServer.ts deleted file mode 100644 index 0c8082a976..0000000000 --- a/src/nni_manager/training_service/aml/amlJobRestServer.ts +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. 
- -'use strict'; - -import { Request, Response, Router } from 'express'; -import { ClusterJobRestServer } from '../common/clusterJobRestServer'; -import { AMLTrainingService } from './amlTrainingService'; - -export interface ParameterFileMeta { - readonly experimentId: string; - readonly trialId: string; - readonly filePath: string; -} - -/** - * PAI Training service Rest server, provides rest API to support pai job metrics update - * - */ -export class PAIJobRestServer extends ClusterJobRestServer { - protected parameterFileMetaList: ParameterFileMeta[] = []; - - protected readonly paiTrainingService: PAITrainingService; - - /** - * constructor to provide NNIRestServer's own rest property, e.g. port - */ - constructor (paiTrainingService: PAITrainingService) { - super(); - this.paiTrainingService = paiTrainingService; - } - - protected handleTrialMetrics(jobId: string, metrics: any[]): void { - // Split metrics array into single metric, then emit - // Warning: If not split metrics into single ones, the behavior will be UNKNOWN - for (const singleMetric of metrics) { - this.paiTrainingService.MetricsEmitter.emit('metric', { - id : jobId, - data : singleMetric - }); - } - } - - protected createRestHandler(): Router { - const router: Router = super.createRestHandler(); - - router.post(`/parameter-file-meta`, (req: Request, res: Response) => { - try { - this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`); - this.parameterFileMetaList.push(req.body); - res.send(); - } catch (err) { - this.log.error(`POST parameter-file-meta error: ${err}`); - res.status(500); - res.send(err.message); - } - }); - - router.get(`/parameter-file-meta`, (req: Request, res: Response) => { - try { - this.log.info(`GET /parameter-file-meta`); - res.send(this.parameterFileMetaList); - } catch (err) { - this.log.error(`GET parameter-file-meta error: ${err}`); - res.status(500); - res.send(err.message); - } - }); - - return router; - } -} diff --git 
a/src/nni_manager/training_service/aml/amlTrainingService.ts b/src/nni_manager/training_service/aml/amlTrainingService.ts index 94142d4460..9c84f8115d 100644 --- a/src/nni_manager/training_service/aml/amlTrainingService.ts +++ b/src/nni_manager/training_service/aml/amlTrainingService.ts @@ -235,34 +235,31 @@ abstract class AMLTrainingService implements TrainingService { if (this.amlTrialConfig === undefined) { throw new Error('trial config is not initialized'); } - - //Generate Job Configuration in yaml format - const paiJobConfig = this.generateJobConfigInYamlFormat(trialJobDetail); - this.log.debug(paiJobConfig); - // Step 2. Submit PAI job via Rest call - // Refer https://github.com/Microsoft/pai/blob/master/docs/rest-server/API.md for more detail about PAI Rest API - const submitJobRequest: request.Options = { - uri: `${this.protocol}://${this.paiClusterConfig.host}/rest-server/api/v2/jobs`, - method: 'POST', - body: paiJobConfig, - headers: { - 'Content-Type': 'text/yaml', - Authorization: `Bearer ${this.paiToken}` - } - }; - request(submitJobRequest, (error: Error, response: request.Response, body: any) => { - if ((error !== undefined && error !== null) || response.statusCode >= 400) { - const errorMessage: string = (error !== undefined && error !== null) ? 
error.message : - `Submit trial ${trialJobId} failed, http code:${response.statusCode}, http body: ${body}`; - - this.log.error(errorMessage); - trialJobDetail.status = 'FAILED'; - } else { - trialJobDetail.submitTime = Date.now(); - } - deferred.resolve(true); + let pyshell = new PythonShell('jobSubmission.py', { + scriptPath: './config/aml', + args: [ + '--subscription_id', this.amlClusterConfig.subscriptionId, + '--resource_group', this.amlClusterConfig.resourceGroup, + '--workspace_name', this.amlClusterConfig.workspaceName, + '--computer_target', this.amlTrialConfig.computerTarget, + '--docker_image', this.amlTrialConfig.image, + '--experiment_name', this.experimentId, + '--code_dir', this.amlTrialConfig.codeDir, + '--script', this.amlTrialConfig.script + ] + }); + pyshell.on('message', function (message) { + // received a message sent from the Python script (a simple "print" statement) + console.log(message); + }); + // end the input stream and allow the process to exit + pyshell.end(function (err,code,signal) { + if (err) throw err; + console.log('The exit code was: ' + code); + console.log('The exit signal was: ' + signal); + console.log('finished'); + deferred.resolve(); }); - return deferred.promise; } } diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index 382dd4fe7d..145da32cb3 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -335,7 +335,7 @@ def setPathCheck(key): 'codeDir': setPathCheck('codeDir'), 'script': setType('script', str), 'image': setType('image', str), - 'computeTarget': setType('computeClusterName', str), + 'computerTarget': setType('computerTarget', str), 'nodeCount': setType('nodeCount', int) } } @@ -494,3 +494,5 @@ def setPathCheck(key): KUBEFLOW_CONFIG_SCHEMA = Schema({**common_schema, **kubeflow_trial_schema, **kubeflow_config_schema}) FRAMEWORKCONTROLLER_CONFIG_SCHEMA = Schema({**common_schema, **frameworkcontroller_trial_schema, **frameworkcontroller_config_schema}) + 
+AML_CONFIG_SCHEMA = Schema({**common_schema, **aml_trial_schema, **aml_config_schema}) \ No newline at end of file diff --git a/tools/nni_cmd/launcher_utils.py b/tools/nni_cmd/launcher_utils.py index 5fbd9bf176..2e4c5c171a 100644 --- a/tools/nni_cmd/launcher_utils.py +++ b/tools/nni_cmd/launcher_utils.py @@ -6,7 +6,7 @@ from schema import SchemaError from schema import Schema from .config_schema import LOCAL_CONFIG_SCHEMA, REMOTE_CONFIG_SCHEMA, PAI_CONFIG_SCHEMA, PAI_YARN_CONFIG_SCHEMA, \ - DLTS_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA, FRAMEWORKCONTROLLER_CONFIG_SCHEMA, \ + DLTS_CONFIG_SCHEMA, KUBEFLOW_CONFIG_SCHEMA, FRAMEWORKCONTROLLER_CONFIG_SCHEMA, AML_CONFIG_SCHEMA, \ tuner_schema_dict, advisor_schema_dict, assessor_schema_dict from .common_utils import print_error, print_warning, print_normal, get_yml_content @@ -149,7 +149,7 @@ def validate_common_content(experiment_config): '''Validate whether the common values in experiment_config is valid''' if not experiment_config.get('trainingServicePlatform') or \ experiment_config.get('trainingServicePlatform') not in [ - 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts' + 'local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts', 'aml' ]: print_error('Please set correct trainingServicePlatform!') exit(1) @@ -161,6 +161,7 @@ def validate_common_content(experiment_config): 'kubeflow': KUBEFLOW_CONFIG_SCHEMA, 'frameworkcontroller': FRAMEWORKCONTROLLER_CONFIG_SCHEMA, 'dlts': DLTS_CONFIG_SCHEMA, + 'aml': AML_CONFIG_SCHEMA, } separate_schema_dict = { 'tuner': tuner_schema_dict, From 3f4c177649e368cf1ef88eeecdeb3a68003df795 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 16 Jun 2020 17:19:46 +0800 Subject: [PATCH 25/98] separate channel and add gpu collector in runner --- .../training_service/reusable/environment.ts | 5 + .../reusable/trialDispatcher.ts | 13 +- tools/nni_trial_tool/base_channel.py | 118 ++++++++++++++++++ tools/nni_trial_tool/file_channel.py | 58 +++++++++ 
tools/nni_trial_tool/gpu.py | 60 +++++++++ tools/nni_trial_tool/protocol.py | 95 -------------- tools/nni_trial_tool/trial_runner.py | 42 +++++-- 7 files changed, 281 insertions(+), 110 deletions(-) create mode 100644 tools/nni_trial_tool/base_channel.py create mode 100644 tools/nni_trial_tool/file_channel.py create mode 100644 tools/nni_trial_tool/gpu.py delete mode 100644 tools/nni_trial_tool/protocol.py diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index ed31313169..0f9b42c1c6 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -42,6 +42,11 @@ export class RunnerSettings { public nniManagerVersion: string = ""; public logCollection: string = "none"; public command: string = ""; + public enableGpuCollector: boolean = false; + + // specify which communication channel is used by runner. + // supported channel includes: api, storage, aml + public commandChannel: string = "api"; } export class EnvironmentInformation { diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index e017eaaeb0..dcab057343 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -75,6 +75,7 @@ class TrialDispatcher implements TrainingService { if (logLevel == "debug" && fs.existsSync("../../../src/nni_manager")) { this.log.debug("log level is debug, and exist code folder, so set to developing mode."); this.isDeveloping = true; + this.runnerSettings.enableGpuCollector = true; } } @@ -313,6 +314,7 @@ class TrialDispatcher implements TrainingService { liveTrialsCount++; continue; } + const environmentStatus = environment.status; // any node exit, then make sure the whole trial stopped. 
if (trial.nodeExitResults.length > 0) { @@ -334,14 +336,11 @@ class TrialDispatcher implements TrainingService { } trial.status = finalStatus; this.releaseEnvironment(trial); + } else if (environmentStatus !== "RUNNING") { + this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environmentStatus}', set trial to environment status.`); + this.releaseEnvironment(trial); + trial.status = environmentStatus; } else { - // check status consistence with environment. - const environmentStatus = environment.status; - if (environmentStatus !== "RUNNING") { - this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environmentStatus}', set trial to environment status.`); - this.releaseEnvironment(trial); - trial.status = environmentStatus; - } liveTrialsCount++; } } diff --git a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py new file mode 100644 index 0000000000..9f7e5431fa --- /dev/null +++ b/tools/nni_trial_tool/base_channel.py @@ -0,0 +1,118 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import json +import time +from abc import ABC, abstractmethod +from enum import Enum +from queue import Queue, Empty +import threading + +from .log_utils import LogType, nni_log + + +class CommandType(Enum): + Initialize = b'IN' + RequestTrialJobs = b'GE' + ReportMetricData = b'ME' + ReportGpuInfo = b'GP' + UpdateSearchSpace = b'SS' + ImportData = b'FD' + AddCustomizedTrialJob = b'AD' + TrialEnd = b'EN' + Terminate = b'TE' + Ping = b'PI' + + Initialized = b'ID' + NewTrialJob = b'TR' + SendTrialJobParameter = b'SP' + NoMoreTrialJobs = b'NO' + KillTrialJob = b'KI' + + +class BaseChannel(ABC): + def __init__(self, args): + self.is_keep_parsed = args.node_count > 1 + self.args = args + + # initialize receive, send threads. 
+ self.is_running = True + self.receive_queue = Queue() + self.receive_thread = threading.Thread(target=self._receive_loop) + self.receive_thread.start() + self.send_queue = Queue() + self.send_thread = threading.Thread(target=self._send_loop) + self.send_thread.start() + + @abstractmethod + def _inner_send(self, message): + pass + + @abstractmethod + def _inner_receive(self): + return [] + + def _receive_loop(self): + while (self.is_running): + messages = self._inner_receive() + if messages is not None: + for message in messages: + self.receive_queue.put(message) + time.sleep(0.5) + + def _send_loop(self): + while (self.is_running): + try: + # no sleep, since it's a block call with 1 second timeout + message = self.send_queue.get(True, 1) + if message is not None: + nni_log(LogType.Info, 'Sending command, data: [%s]' % message) + self._inner_send(message) + except Empty: + # do nothing, if no command received. + pass + + def close(self): + self.is_running = False + + def send(self, command, data): + """Send command to Training Service. + command: CommandType object. + data: string payload. + the message is sent synchronized. + """ + data = json.dumps(data) + data = data.encode('utf8') + message = b'%b%014d%b' % (command.value, len(data), data) + self.send_queue.put(message) + + def receive(self): + """Receive a command from Training Service. 
+ Returns a tuple of command (CommandType) and payload (str) + """ + command = None + data = None + + try: + command_content = self.receive_queue.get(False) + if command_content is not None: + if (len(command_content) < 16): + # invalid header + nni_log(LogType.Error, 'incorrect command is found, command must be greater than 16 bytes!') + return None, None + header = command_content[:16] + nni_log(LogType.Info, 'Received command, header: [%s]' % header) + command = CommandType(header[:2]) + length = int(header[2:]) + if (len(command_content)-16 != length): + nni_log(LogType.Error, 'incorrect command length, length {}, actual data length is {}.'.format(length, len(command)-16)) + return None, None + data = command_content[16:16+length] + data = json.loads(data.decode('utf8')) + nni_log(LogType.Info, 'Received command, data: [%s]' % data) + except Empty: + # do nothing, if no command received. + pass + except Exception as identifier: + nni_log(LogType.Error, 'meet unhandled exception in base_channel: %s' % identifier) + return command, data diff --git a/tools/nni_trial_tool/file_channel.py b/tools/nni_trial_tool/file_channel.py new file mode 100644 index 0000000000..a2fcb4f0e0 --- /dev/null +++ b/tools/nni_trial_tool/file_channel.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import os +import time +from datetime import datetime + +from .base_channel import BaseChannel +from .log_utils import LogType, nni_log + +command_path = "./commands" +runner_command_prefix = "runner_command_" +manager_command_prefix = "manager_command_" + +class FileChannel(BaseChannel): + + def __init__(self, args): + super(FileChannel, self).__init__(args) + self.parsed_commands = set() + + def _inner_send(self, message): + if not os.path.exists(command_path): + os.makedirs(command_path, exist_ok=True) + while True: + file_name = os.path.join(command_path, "%s%s.txt" % ( + runner_command_prefix, int(datetime.now().timestamp() * 1000))) + if not os.path.exists(file_name): + break + time.sleep(0.01) + with open(file_name, "wb") as out_file: + out_file.write(message) + + def _inner_receive(self): + messages = [] + + pending_commands = [] + if os.path.exists(command_path): + command_files = os.listdir(command_path) + for file_name in command_files: + if (file_name.startswith(manager_command_prefix)) and file_name not in self.parsed_commands: + pending_commands.append(file_name) + pending_commands.sort() + + for file_name in pending_commands: + full_file_name = os.path.join(command_path, file_name) + with open(full_file_name, "rb") as in_file: + header = in_file.read(16) + if header is None or len(header) < 16: + # invalid header + nni_log(LogType.Error, 'incorrect command is found!') + return None + length = int(header[2:]) + data = in_file.read(length) + messages.append(header + data) + if not self.is_keep_parsed: + os.remove(full_file_name) + self.parsed_commands.add(file_name) + return messages diff --git a/tools/nni_trial_tool/gpu.py b/tools/nni_trial_tool/gpu.py new file mode 100644 index 0000000000..0517763710 --- /dev/null +++ b/tools/nni_trial_tool/gpu.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import subprocess +import time +import traceback +from xml.dom import minidom + + +def collect_gpu_usage(node_id): + cmd = 'nvidia-smi -q -x'.split() + info = None + try: + smi_output = subprocess.check_output(cmd) + info = parse_nvidia_smi_result(smi_output) + except Exception: + traceback.print_exc() + info = gen_empty_gpu_metric() + if node_id is not None: + info["node"] = node_id + return info + + +def parse_nvidia_smi_result(smi): + output = {} + try: + xmldoc = minidom.parseString(smi) + gpuList = xmldoc.getElementsByTagName('gpu') + output["Timestamp"] = time.asctime(time.localtime()) + output["gpuCount"] = len(gpuList) + output["gpuInfos"] = [] + for gpuIndex, gpu in enumerate(gpuList): + gpuInfo = {} + gpuInfo['index'] = gpuIndex + gpuInfo['gpuUtil'] = gpu.getElementsByTagName('utilization')[0]\ + .getElementsByTagName('gpu_util')[0]\ + .childNodes[0].data.replace("%", "").strip() + gpuInfo['gpuMemUtil'] = gpu.getElementsByTagName('utilization')[0]\ + .getElementsByTagName('memory_util')[0]\ + .childNodes[0].data.replace("%", "").strip() + processes = gpu.getElementsByTagName('processes') + runningProNumber = len(processes[0].getElementsByTagName('process_info')) + gpuInfo['activeProcessNum'] = runningProNumber + + output["gpuInfos"].append(gpuInfo) + except Exception: + # e_info = sys.exc_info() + traceback.print_exc() + return output + + +def gen_empty_gpu_metric(): + output = {} + try: + output["Timestamp"] = time.asctime(time.localtime()) + output["gpuCount"] = 0 + output["gpuInfos"] = [] + except Exception: + traceback.print_exc() + return output diff --git a/tools/nni_trial_tool/protocol.py b/tools/nni_trial_tool/protocol.py deleted file mode 100644 index 604d81c042..0000000000 --- a/tools/nni_trial_tool/protocol.py +++ /dev/null @@ -1,95 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT license. 
- -import json -import os -import time -from datetime import datetime -from enum import Enum - -from .log_utils import LogType, nni_log - -command_path = "./commands" -runner_command_prefix = "runner_command_" -manager_command_prefix = "manager_command_" - -parsed_commands = set() - - -class CommandType(Enum): - Initialize = b'IN' - RequestTrialJobs = b'GE' - ReportMetricData = b'ME' - UpdateSearchSpace = b'SS' - ImportData = b'FD' - AddCustomizedTrialJob = b'AD' - TrialEnd = b'EN' - Terminate = b'TE' - Ping = b'PI' - - Initialized = b'ID' - NewTrialJob = b'TR' - SendTrialJobParameter = b'SP' - NoMoreTrialJobs = b'NO' - KillTrialJob = b'KI' - - -def send(command, data): - """Send command to Training Service. - command: CommandType object. - data: string payload. - """ - - if not os.path.exists(command_path): - os.makedirs(command_path) - while True: - file_name = os.join(command_path, "%s%s.txt" % ( - runner_command_prefix, int(datetime.now().timestamp * 1000))) - if (os.path.exists(file_name)): - time.sleep(0.01) - continue - with open(file_name, "wb") as out_file: - data = json.dumps(data) - data = data.encode('utf8') - msg = b'%b%014d%b' % (command.value, len(data), data) - nni_log(LogType.Info, 'Sending command, data: [%s]' % msg) - out_file.write(msg) - break - - -def receive(is_keep_parsed=True): - """Receive a command from Training Service. 
- Returns a tuple of command (CommandType) and payload (str) - """ - command = None - data = None - - try: - pending_commands = [] - if os.path.exists(command_path): - command_files = os.listdir(command_path) - for file_name in command_files: - if (file_name.startswith(manager_command_prefix)) and file_name not in parsed_commands: - pending_commands.append(file_name) - pending_commands.sort() - - for file_name in pending_commands: - full_file_name = os.path.join(command_path, file_name) - with open(full_file_name, "rb") as _in_file: - header = _in_file.read(16) - nni_log(LogType.Info, 'Received command, header: [%s]' % header) - if header is None or len(header) < 16: - # invalid header - nni_log(LogType.Error, 'incorrect command is found!') - return None, None - length = int(header[2:]) - data = _in_file.read(length) - command = CommandType(header[:2]) - data = json.loads(data.decode('utf8')) - nni_log(LogType.Info, 'Received command, data: [%s]' % data) - if not is_keep_parsed: - os.remove(full_file_name) - parsed_commands.add(file_name) - except Exception as identifier: - nni_log(LogType.Error, 'meet unhandled exception: %s' % identifier) - return command, data diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 8bc00b5076..e4bcb592f4 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -9,11 +9,15 @@ import sys import threading import time -from datetime import datetime +import traceback +from datetime import datetime, timedelta import pkg_resources +from .gpu import collect_gpu_usage + idle_timeout_seconds = 10 * 60 +gpu_refressh_interval_seconds = 5 regular = re.compile('v?(?P[0-9](\.[0-9]){0,1}).*') trial_runner_syslogger = None @@ -21,14 +25,22 @@ def main_loop(args): '''main loop logic for trial runner''' idle_last_time = datetime.now() - is_multi_node = args.node_count > 1 + gpu_refresh_last_time = datetime.now() - timedelta(minutes=1) + + # init command channel + 
command_channel = None + if args.command_channel == "api": + command_channel = FileChannel(args) + else: + command_channel = FileChannel(args) + nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel))) trial = None try: # command loop while True: - command_type, command_data = receive(is_multi_node) + command_type, command_data = command_channel.receive() if command_type == CommandType.NewTrialJob: if trial is not None: if trial.is_running(): @@ -52,13 +64,20 @@ def main_loop(args): nni_log(LogType.Info, "trial runner is idle more than {0} seconds, so exit.".format( idle_timeout_seconds)) break + + if args.enable_gpu_collect and (datetime.now() - gpu_refresh_last_time).seconds > gpu_refressh_interval_seconds: + # collect gpu information + gpu_info = collect_gpu_usage(args.node_id) + command_channel.send(CommandType.ReportGpuInfo, gpu_info) + gpu_refresh_last_time = datetime.now() time.sleep(0.5) - except Exception as ex: - nni_log(LogType.Error, ex) + except Exception: + traceback.print_exc() finally: nni_log(LogType.Info, "main_loop exits.") if trial is not None: trial.kill() + command_channel.close() def trial_runner_help_info(*args): @@ -134,16 +153,22 @@ def run(self): PARSER.add_argument('--node_count', type=int, help='number of nodes, it determines how to consume command and save code file') args, unknown = PARSER.parse_known_args() - setting_file = "../settings.json" + setting_file = "settings.json" + if not os.path.exists(setting_file): + setting_file = "../{}".format(setting_file) if os.path.exists(setting_file): with open(setting_file, 'r') as fp: settings = json.load(fp) - print("setting is {}".format(settings)) + print("setting is {}".format(settings)) + else: + print("not found setting file") args.exp_id = settings["experimentId"] args.platform = settings["platform"] # runner_id is unique runner in experiment, and will be updated if it's multi-nodes args.runner_id = 
"runner_"+os.path.basename(os.path.realpath(os.path.curdir)) + args.enable_gpu_collect = settings["enableGpuCollector"] + args.command_channel = settings["commandChannel"] if args.trial_command is None: args.trial_command = settings["command"] @@ -169,8 +194,9 @@ def run(self): from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log from .rest_utils import rest_get, rest_post from .url_utils import gen_parameter_meta_url, gen_send_version_url - from .protocol import CommandType, receive from .trial import Trial + from .file_channel import FileChannel + from .base_channel import CommandType is_multi_node = args.node_count > 1 From 2fa4a77e0046f32679ea0b6fe943980c84ac6b72 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 16 Jun 2020 19:06:21 +0800 Subject: [PATCH 26/98] init --- src/nni_manager/training_service/aml/amlTrainingService.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/nni_manager/training_service/aml/amlTrainingService.ts b/src/nni_manager/training_service/aml/amlTrainingService.ts index 9c84f8115d..0417e343ad 100644 --- a/src/nni_manager/training_service/aml/amlTrainingService.ts +++ b/src/nni_manager/training_service/aml/amlTrainingService.ts @@ -237,6 +237,7 @@ abstract class AMLTrainingService implements TrainingService { } let pyshell = new PythonShell('jobSubmission.py', { scriptPath: './config/aml', + pythonOptions: ['-u'], // get print results in real-time args: [ '--subscription_id', this.amlClusterConfig.subscriptionId, '--resource_group', this.amlClusterConfig.resourceGroup, From d0768b06ef438d4ab225f6d1be0a782345412c66 Mon Sep 17 00:00:00 2001 From: Chi Song <27178119+squirrelsc@users.noreply.github.com> Date: Wed, 17 Jun 2020 11:46:57 +0800 Subject: [PATCH 27/98] add more GPU information, and improve debugging. 
--- .../training_service/reusable/trialDispatcher.ts | 11 ++++++++--- tools/nni_trial_tool/gpu.py | 15 +++++++++++---- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index dcab057343..3ad9b5f24a 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -72,7 +72,8 @@ class TrialDispatcher implements TrainingService { const logLevel = getLogLevel(); this.log.debug(`current folder ${__dirname}`); - if (logLevel == "debug" && fs.existsSync("../../../src/nni_manager")) { + // different source folder in Linux and Windows + if (logLevel == "debug" && (fs.existsSync("../../../src/nni_manager") || __dirname.endsWith("src\\nni_manager\\dist\\training_service\\reusable"))) { this.log.debug("log level is debug, and exist code folder, so set to developing mode."); this.isDeveloping = true; this.runnerSettings.enableGpuCollector = true; @@ -177,7 +178,11 @@ class TrialDispatcher implements TrainingService { await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); if (this.isDeveloping) { - await storageService.copyDirectory("../nni/tools/nni_trial_tool", envDir, true); + let trialToolsPath = "../../../tools/nni_trial_tool"; + if (false === fs.existsSync(trialToolsPath)) { + trialToolsPath = path.join(__dirname, "..\\..\\..\\..\\..\\tools\\nni_trial_tool"); + } + await storageService.copyDirectory(trialToolsPath, envDir, true); } } @@ -274,7 +279,7 @@ class TrialDispatcher implements TrainingService { break; } if (oldIsAlive !== environment.isAlive) { - this.log.debug(`set environment isAlive from ${oldIsAlive} to ${environment.isAlive} due to status is ${environment.status}.`); + this.log.debug(`set environment ${environment.id} isAlive from ${oldIsAlive} to ${environment.isAlive} due to status is ${environment.status}.`); } }); 
await delay(5000); diff --git a/tools/nni_trial_tool/gpu.py b/tools/nni_trial_tool/gpu.py index 0517763710..016b5f98cf 100644 --- a/tools/nni_trial_tool/gpu.py +++ b/tools/nni_trial_tool/gpu.py @@ -16,14 +16,13 @@ def collect_gpu_usage(node_id): except Exception: traceback.print_exc() info = gen_empty_gpu_metric() - if node_id is not None: - info["node"] = node_id + info["node"] = node_id return info def parse_nvidia_smi_result(smi): - output = {} try: + output = {} xmldoc = minidom.parseString(smi) gpuList = xmldoc.getElementsByTagName('gpu') output["Timestamp"] = time.asctime(time.localtime()) @@ -42,19 +41,27 @@ def parse_nvidia_smi_result(smi): runningProNumber = len(processes[0].getElementsByTagName('process_info')) gpuInfo['activeProcessNum'] = runningProNumber + gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name').nodeValue + memUsage = gpu.getElementByTagName('fb_memory_usage')[0] + gpuInfo['gpuMemTotal'] = memUsage['total'].nodeValue.replace("MiB").strip() + gpuInfo['gpuMemUsed'] = memUsage['used'].nodeValue.replace("MiB").strip() + gpuInfo['gpuMemFree'] = memUsage['free'].nodeValue.replace("MiB").strip() + output["gpuInfos"].append(gpuInfo) except Exception: # e_info = sys.exc_info() traceback.print_exc() + output = {} return output def gen_empty_gpu_metric(): - output = {} try: + output = {} output["Timestamp"] = time.asctime(time.localtime()) output["gpuCount"] = 0 output["gpuInfos"] = [] except Exception: traceback.print_exc() + output = {} return output From 8dff16f0ff301bf2886481a665d37f851bfa2f1c Mon Sep 17 00:00:00 2001 From: Chi Song Date: Wed, 17 Jun 2020 12:46:38 +0800 Subject: [PATCH 28/98] fix GPU info collector --- .../training_service/reusable/trialDispatcher.ts | 2 +- tools/nni_trial_tool/gpu.py | 14 +++++++++----- tools/nni_trial_tool/trial.py | 10 +++++----- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts 
b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 3ad9b5f24a..2eac82a12a 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -178,7 +178,7 @@ class TrialDispatcher implements TrainingService { await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); if (this.isDeveloping) { - let trialToolsPath = "../../../tools/nni_trial_tool"; + let trialToolsPath = path.join(__dirname, "../../../../../tools/nni_trial_tool"); if (false === fs.existsSync(trialToolsPath)) { trialToolsPath = path.join(__dirname, "..\\..\\..\\..\\..\\tools\\nni_trial_tool"); } diff --git a/tools/nni_trial_tool/gpu.py b/tools/nni_trial_tool/gpu.py index 016b5f98cf..b7e87d7fb6 100644 --- a/tools/nni_trial_tool/gpu.py +++ b/tools/nni_trial_tool/gpu.py @@ -41,11 +41,15 @@ def parse_nvidia_smi_result(smi): runningProNumber = len(processes[0].getElementsByTagName('process_info')) gpuInfo['activeProcessNum'] = runningProNumber - gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name').nodeValue - memUsage = gpu.getElementByTagName('fb_memory_usage')[0] - gpuInfo['gpuMemTotal'] = memUsage['total'].nodeValue.replace("MiB").strip() - gpuInfo['gpuMemUsed'] = memUsage['used'].nodeValue.replace("MiB").strip() - gpuInfo['gpuMemFree'] = memUsage['free'].nodeValue.replace("MiB").strip() + gpuInfo['gpuType'] = gpu.getElementsByTagName('product_name')[0]\ + .childNodes[0].data + memUsage = gpu.getElementsByTagName('fb_memory_usage')[0] + gpuInfo['gpuMemTotal'] = memUsage.getElementsByTagName('total')[0]\ + .childNodes[0].data.replace("MiB", "").strip() + gpuInfo['gpuMemUsed'] = memUsage.getElementsByTagName('used')[0]\ + .childNodes[0].data.replace("MiB", "").strip() + gpuInfo['gpuMemFree'] = memUsage.getElementsByTagName('free')[0]\ + .childNodes[0].data.replace("MiB", "").strip() output["gpuInfos"].append(gpuInfo) except Exception: diff --git a/tools/nni_trial_tool/trial.py 
b/tools/nni_trial_tool/trial.py index 1c9a11d0f4..b3dc0622be 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -49,9 +49,10 @@ def run(self): trial_code_dir = os.path.join(trial_working_dir, "code") trial_nnioutput_dir = os.path.join(trial_working_dir, "nnioutput") - os.environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) - os.environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") - os.environ['NNI_SYS_DIR'] = trial_working_dir + environ = os.environ.copy() + environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) + environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") + environ['NNI_SYS_DIR'] = trial_working_dir # prepare code and parameters prepared_flag_file_name = os.path.join(trial_working_dir, "trial_prepared") @@ -82,10 +83,9 @@ def run(self): break time.sleep(0.1) - # Notice: We don't appoint env, which means subprocess wil inherit current environment and that is expected behavior self.log_pipe_stdout = self.trial_syslogger_stdout.get_pipelog_reader() self.process = Popen(self.args.trial_command, shell=True, stdout=self.log_pipe_stdout, - stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=dict(os.environ)) + stderr=self.log_pipe_stdout, cwd=trial_code_dir, env=dict(environ)) nni_log(LogType.Info, '{0}: spawns a subprocess (pid {1}) to run command: {2}'. 
format(self.name, self.process.pid, shlex.split(self.args.trial_command))) From e297aa5d9e1fe6a449fad19526621bbfce0dbce0 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Thu, 18 Jun 2020 12:05:17 +0800 Subject: [PATCH 29/98] update --- src/nni_manager/main.ts | 3 +- .../reusable/amlEnvironmentService.ts | 110 ++++++++++++++++++ .../reusable/routerTrainingService.ts | 29 +++++ 3 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 src/nni_manager/training_service/reusable/amlEnvironmentService.ts diff --git a/src/nni_manager/main.ts b/src/nni_manager/main.ts index 243e4cc982..9e60647e9b 100644 --- a/src/nni_manager/main.ts +++ b/src/nni_manager/main.ts @@ -23,7 +23,6 @@ import { KubeflowTrainingService } from './training_service/kubernetes/kubeflow/ import { LocalTrainingService } from './training_service/local/localTrainingService'; import { RouterTrainingService } from './training_service/reusable/routerTrainingService'; import { PAIYarnTrainingService } from './training_service/pai/paiYarn/paiYarnTrainingService'; -import { AMLTrainingService } from './training_service/aml/amlTrainingService'; import { RemoteMachineTrainingService } from './training_service/remote_machine/remoteMachineTrainingService'; @@ -68,7 +67,7 @@ async function initContainer(foreground: boolean, platformMode: string, logFileN .scope(Scope.Singleton); } else if (platformMode === 'aml') { Container.bind(TrainingService) - .to(AMLTrainingService) + .to(RouterTrainingService) .scope(Scope.Singleton); } else { throw new Error(`Error: unsupported mode: ${platformMode}`); diff --git a/src/nni_manager/training_service/reusable/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/amlEnvironmentService.ts new file mode 100644 index 0000000000..22c7dd56c1 --- /dev/null +++ b/src/nni_manager/training_service/reusable/amlEnvironmentService.ts @@ -0,0 +1,110 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as fs from 'fs'; +import * as request from 'request'; +import { Deferred } from 'ts-deferred'; +import * as component from '../../common/component'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../common/log'; +import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; +import { AMLClusterConfig, AMLTrialConfig, AMLTrialJobDetail } from '../aml/amlConfig'; +import { EnvironmentInformation, EnvironmentService } from './environment'; +import { StorageService } from './storageService'; +import { + NNIManagerIpConfig, TrainingService, + TrialJobApplicationForm, TrialJobDetail, TrialJobMetric +} from '../../common/trainingService'; +import { execMkdir, validateCodeDir, execCopydir } from '../common/util'; +import { + generateParamFileName, + getIPV4Address, getVersion, uniqueString +} from '../../common/utils'; + +const yaml = require('js-yaml'); + +/** + * Collector PAI jobs info from PAI cluster, and update pai job status locally + */ +@component.Singleton +export class AMLEnvironmentService implements EnvironmentService { + + private readonly log: Logger = getLogger(); + private amlClusterConfig: AMLClusterConfig | undefined; + private amlTrialConfig: AMLTrialConfig | undefined; + private amlJobConfig: any; + private stopping: boolean = false; + private versionCheck: boolean = true; + private isMultiPhase: boolean = false; + private nniVersion?: string; + private experimentId: string; + private nniManagerIpConfig?: NNIManagerIpConfig; + + constructor() { + this.experimentId = getExperimentId(); + } + + public get hasStorageService(): boolean { + return true; + } + + public async config(key: string, value: string): Promise { + switch (key) { + case TrialConfigMetadataKey.NNI_MANAGER_IP: + this.nniManagerIpConfig = JSON.parse(value); + break; + + case TrialConfigMetadataKey.AML_CLUSTER_CONFIG: + this.amlClusterConfig = JSON.parse(value); + break; + + case 
TrialConfigMetadataKey.TRIAL_CONFIG: { + if (this.amlClusterConfig === undefined) { + this.log.error('aml cluster config is not initialized'); + break; + } + this.amlTrialConfig = JSON.parse(value); + // Validate to make sure codeDir doesn't have too many files + await validateCodeDir(this.amlTrialConfig.codeDir); + break; + } + case TrialConfigMetadataKey.VERSION_CHECK: + this.versionCheck = (value === 'true' || value === 'True'); + this.nniVersion = this.versionCheck ? await getVersion() : ''; + break; + case TrialConfigMetadataKey.MULTI_PHASE: + this.isMultiPhase = (value === 'true' || value === 'True'); + break; + default: + //Reject for unknown keys + this.log.error(`Uknown key: ${key}`); + } + } + + public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { + } + + public async startEnvironment(environment: EnvironmentInformation): Promise { + } + + public async stopEnvironment(environment: EnvironmentInformation): Promise { + } +} diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 8ce766224d..cd62919a04 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -25,11 +25,14 @@ import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetri import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; +import { AMLClusterConfig } from '../aml/amlConfig'; import { PAIK8STrainingService } from '../pai/paiK8S/paiK8STrainingService'; +import { AMLTrainingService } from '../aml/amlTrainingService'; import { TrialDispatcher } from './trialDispatcher'; import { Container, Scope } from 'typescript-ioc'; import { EnvironmentService } from './environment'; import { OpenPaiEnvironmentService } from 
'./openPaiEnvironmentService'; +import { AMLEnvironmentService } from './amlEnvironmentService'; import { StorageService } from './storageService'; import { MountedStorageService } from './mountedStorageService'; import { TrialService } from './trial'; @@ -142,6 +145,32 @@ class RouterTrainingService implements TrainingService { } await this.internalTrainingService.setClusterMetadata(key, value); + this.metaDataCache.clear(); + } else if (key === TrialConfigMetadataKey.AML_CLUSTER_CONFIG) { + const config = JSON.parse(value); + this.internalTrainingService = component.get(TrialDispatcher); + + Container.bind(EnvironmentService) + .to(AMLEnvironmentService) + .scope(Scope.Singleton); + Container.bind(StorageService) + .to(MountedStorageService) + .scope(Scope.Singleton); + Container.bind(TrialService) + .to(StorageTrialService) + .scope(Scope.Singleton); + for (const [key, value] of this.metaDataCache) { + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + await this.internalTrainingService.setClusterMetadata(key, value); + } + + if (this.internalTrainingService === undefined) { + throw new Error("TrainingService is not assigned!"); + } + await this.internalTrainingService.setClusterMetadata(key, value); + this.metaDataCache.clear(); } else { this.log.debug(`caching metadata key:{} value:{}, as training service is not determined.`); From 500c1cba8d5aba48dcaaa4ea36607a38d9e20e85 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Thu, 18 Jun 2020 15:20:35 +0800 Subject: [PATCH 30/98] channel support single file --- .../reusable/mountedStorageService.ts | 5 + .../reusable/storageService.ts | 22 +++- .../reusable/storageTrialService.ts | 20 +-- tools/nni_trial_tool/base_channel.py | 35 +++-- tools/nni_trial_tool/file_channel.py | 97 +++++++++----- tools/nni_trial_tool/test/__init__.py | 11 ++ .../nni_trial_tool/test/test_file_channel.py | 123 ++++++++++++++++++ 7 files changed, 243 insertions(+), 70 deletions(-) 
create mode 100644 tools/nni_trial_tool/test/__init__.py create mode 100644 tools/nni_trial_tool/test/test_file_channel.py diff --git a/src/nni_manager/training_service/reusable/mountedStorageService.ts b/src/nni_manager/training_service/reusable/mountedStorageService.ts index 9c4c6f9d13..27c3cb7034 100644 --- a/src/nni_manager/training_service/reusable/mountedStorageService.ts +++ b/src/nni_manager/training_service/reusable/mountedStorageService.ts @@ -138,6 +138,11 @@ export class MountedStorageService extends StorageService { return results; } + protected async internalAttach(remotePath: string, content: string): Promise { + await fs.promises.appendFile(remotePath, content + "\n"); + return true; + } + protected internalIsRelativePath(remotePath: string): boolean { return !path.isAbsolute(remotePath); } diff --git a/src/nni_manager/training_service/reusable/storageService.ts b/src/nni_manager/training_service/reusable/storageService.ts index 402203e5c4..1d08643fcf 100644 --- a/src/nni_manager/training_service/reusable/storageService.ts +++ b/src/nni_manager/training_service/reusable/storageService.ts @@ -39,6 +39,7 @@ export abstract class StorageService { protected abstract async internalExists(remotePath: string): Promise; protected abstract async internalRead(remotePath: string, offset: number, length: number): Promise; protected abstract async internalList(remotePath: string): Promise; + protected abstract async internalAttach(remotePath: string, content: string): Promise; protected abstract internalIsRelativePath(path: string): boolean; protected abstract internalJoin(...paths: string[]): string; protected abstract internalDirname(...paths: string[]): string; @@ -126,7 +127,7 @@ export abstract class StorageService { return exists } - public async save(content: string, remotePath: string): Promise { + public async save(content: string, remotePath: string, isAttach: boolean = false): Promise { remotePath = this.expandPath(true, remotePath); 
this.logger.debug(`saving content to remotePath: ${remotePath}, length: ${content.length}`); const fileName = this.internalBasename(remotePath); @@ -137,13 +138,20 @@ export abstract class StorageService { const remoteDir = this.internalDirname(remotePath); const remoteTempFile = this.internalJoin(remoteDir, tempFileName); - if (await this.internalExists(remotePath) === true) { - await this.internalRemove(remotePath, false, false); + if (isAttach) { + const result = await this.internalAttach(remotePath, content); + if (false === result){ + throw new Error("this.internalAttach doesn't support"); + } + } else { + if (await this.internalExists(remotePath) === true) { + await this.internalRemove(remotePath, false, false); + } + await fs.promises.writeFile(localTempFileName, content); + await this.internalCopy(localTempFileName, remoteDir, false, false, true); + await this.rename(remoteTempFile, fileName); + await fs.promises.unlink(localTempFileName); } - await fs.promises.writeFile(localTempFileName, content); - await this.internalCopy(localTempFileName, remoteDir, false, false, true); - await this.rename(remoteTempFile, fileName); - await fs.promises.unlink(localTempFileName); } public async copyFile(localPath: string, remotePath: string): Promise { diff --git a/src/nni_manager/training_service/reusable/storageTrialService.ts b/src/nni_manager/training_service/reusable/storageTrialService.ts index 25e4651601..16c53c2e98 100644 --- a/src/nni_manager/training_service/reusable/storageTrialService.ts +++ b/src/nni_manager/training_service/reusable/storageTrialService.ts @@ -105,28 +105,12 @@ export class StorageTrialService extends TrialService { private async sendCommand(commantType: string, data: any, environment: EnvironmentInformation): Promise { let retryCount = 10; - let fileName: string; - let filePath: string = ""; let findingName: boolean = true; const command = encodeCommand(commantType, JSON.stringify(data)); const storageService = 
component.get(StorageService); - const commandPath = storageService.joinPath(environment.workingFolder, `commands`); - - while (findingName) { - fileName = `manager_command_${new Date().getTime()}.txt`; - filePath = storageService.joinPath(commandPath, fileName); - if (!await storageService.exists(filePath)) { - findingName = false; - break; - } - if (retryCount == 0) { - throw new Error(`EnvironmentManager retry too many times to send command!`); - } - retryCount--; - await delay(1); - } + const fileName = storageService.joinPath(environment.workingFolder, `commands`, `manager_commands.txt`); // prevent to have imcomplete command, so save as temp name and then rename. - await storageService.save(command.toString("utf8"), filePath); + await storageService.save(command.toString("utf8"), fileName, true); } } diff --git a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py index 9f7e5431fa..9d566ef996 100644 --- a/tools/nni_trial_tool/base_channel.py +++ b/tools/nni_trial_tool/base_channel.py @@ -2,11 +2,12 @@ # Licensed under the MIT license. 
import json +import threading import time from abc import ABC, abstractmethod +from datetime import datetime from enum import Enum -from queue import Queue, Empty -import threading +from queue import Empty, Queue from .log_utils import LogType, nni_log @@ -15,7 +16,7 @@ class CommandType(Enum): Initialize = b'IN' RequestTrialJobs = b'GE' ReportMetricData = b'ME' - ReportGpuInfo = b'GP' + ReportGpuInfo = b'GI' UpdateSearchSpace = b'SS' ImportData = b'FD' AddCustomizedTrialJob = b'AD' @@ -30,6 +31,9 @@ class CommandType(Enum): KillTrialJob = b'KI' +INTERVAL_SECONDS = 0.5 + + class BaseChannel(ABC): def __init__(self, args): self.is_keep_parsed = args.node_count > 1 @@ -58,19 +62,20 @@ def _receive_loop(self): if messages is not None: for message in messages: self.receive_queue.put(message) - time.sleep(0.5) + time.sleep(INTERVAL_SECONDS) def _send_loop(self): while (self.is_running): + message = None try: - # no sleep, since it's a block call with 1 second timeout - message = self.send_queue.get(True, 1) - if message is not None: - nni_log(LogType.Info, 'Sending command, data: [%s]' % message) - self._inner_send(message) + # no sleep, since it's a block call with INTERVAL_SECONDS second timeout + message = self.send_queue.get(True, INTERVAL_SECONDS) except Empty: # do nothing, if no command received. pass + if message is not None: + nni_log(LogType.Info, 'Sending command: %s' % message) + self._inner_send(message) def close(self): self.is_running = False @@ -86,6 +91,12 @@ def send(self, command, data): message = b'%b%014d%b' % (command.value, len(data), data) self.send_queue.put(message) + def sent(self): + return self.send_queue.qsize() == 0 + + def received(self): + return self.receive_queue.qsize() > 0 + def receive(self): """Receive a command from Training Service. 
Returns a tuple of command (CommandType) and payload (str) @@ -101,15 +112,15 @@ def receive(self): nni_log(LogType.Error, 'incorrect command is found, command must be greater than 16 bytes!') return None, None header = command_content[:16] - nni_log(LogType.Info, 'Received command, header: [%s]' % header) command = CommandType(header[:2]) length = int(header[2:]) if (len(command_content)-16 != length): - nni_log(LogType.Error, 'incorrect command length, length {}, actual data length is {}.'.format(length, len(command)-16)) + nni_log(LogType.Error, 'incorrect command length, length {}, actual data length is {}, header {}.' + .format(length, len(command_content)-16, header)) return None, None data = command_content[16:16+length] data = json.loads(data.decode('utf8')) - nni_log(LogType.Info, 'Received command, data: [%s]' % data) + nni_log(LogType.Info, 'Received command, header: [%s], data: [%s]' % (header, data)) except Empty: # do nothing, if no command received. pass diff --git a/tools/nni_trial_tool/file_channel.py b/tools/nni_trial_tool/file_channel.py index a2fcb4f0e0..3d728813a0 100644 --- a/tools/nni_trial_tool/file_channel.py +++ b/tools/nni_trial_tool/file_channel.py @@ -9,50 +9,81 @@ from .log_utils import LogType, nni_log command_path = "./commands" -runner_command_prefix = "runner_command_" -manager_command_prefix = "manager_command_" +runner_commands_file_name_prefix = "runner_commands" +manager_commands_file_name = "manager_commands.txt" + class FileChannel(BaseChannel): def __init__(self, args): + self.node_id = args.node_id + self.out_file = None + self.in_file = None + self.in_offset = 0 + self.in_cache = b"" + super(FileChannel, self).__init__(args) - self.parsed_commands = set() + + def close(self): + super(FileChannel, self).close() + if self.out_file is not None: + self.out_file.close() + self.out_file = None + if self.in_file is not None: + self.in_file.close() + self.in_file = None def _inner_send(self, message): - if not 
os.path.exists(command_path): - os.makedirs(command_path, exist_ok=True) - while True: - file_name = os.path.join(command_path, "%s%s.txt" % ( - runner_command_prefix, int(datetime.now().timestamp() * 1000))) - if not os.path.exists(file_name): - break - time.sleep(0.01) - with open(file_name, "wb") as out_file: - out_file.write(message) + if self.out_file is None: + if not os.path.exists(command_path): + os.makedirs(command_path, exist_ok=True) + + if self.node_id is None: + file_name = os.path.join(command_path, "%s.txt" % runner_commands_file_name_prefix) + else: + file_name = os.path.join(command_path, "%s_%s.txt" % ( + runner_commands_file_name_prefix, self.node_id)) + self.out_file = open(file_name, "ab") + + self.out_file.write(message) + self.out_file.write(b'\n') + self.out_file.flush() + + def _open_manager_command(self): + manager_command_file_name = os.path.join(command_path, manager_commands_file_name) + + if self.in_file is not None and self.in_file.closed: + self.in_file = None + + if self.in_file is None and os.path.exists(manager_command_file_name): + self.in_file = open(manager_command_file_name, "rb") + self.in_file.seek(self.in_offset) def _inner_receive(self): messages = [] - pending_commands = [] - if os.path.exists(command_path): - command_files = os.listdir(command_path) - for file_name in command_files: - if (file_name.startswith(manager_command_prefix)) and file_name not in self.parsed_commands: - pending_commands.append(file_name) - pending_commands.sort() - - for file_name in pending_commands: - full_file_name = os.path.join(command_path, file_name) - with open(full_file_name, "rb") as in_file: - header = in_file.read(16) - if header is None or len(header) < 16: - # invalid header - nni_log(LogType.Error, 'incorrect command is found!') - return None + if self.in_file is None: + self._open_manager_command() + if self.in_file is not None: + self.in_file.seek(0, os.SEEK_END) + new_offset = self.in_file.tell() + 
self.in_file.seek(self.in_offset, os.SEEK_SET) + count = new_offset - self.in_offset + if count > 0: + self.in_cache += self.in_file.read(count) + self.in_offset = new_offset + while(len(self.in_cache)) >= 16: + header = self.in_cache[:16] length = int(header[2:]) - data = in_file.read(length) + + # consider there is an \n at end of a message. + total_length = length+16+1 + # break, if buffer is too short. + if len(self.in_cache) < total_length: + break + data = self.in_cache[16:total_length-1] + if 10 != self.in_cache[total_length-1]: + nni_log(LogType.Error, 'end of message should be \\n, but got {}'.format(self.in_cache[total_length-1])) + self.in_cache = self.in_cache[total_length:] messages.append(header + data) - if not self.is_keep_parsed: - os.remove(full_file_name) - self.parsed_commands.add(file_name) return messages diff --git a/tools/nni_trial_tool/test/__init__.py b/tools/nni_trial_tool/test/__init__.py new file mode 100644 index 0000000000..eb551d5088 --- /dev/null +++ b/tools/nni_trial_tool/test/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import os + +os.environ['NNI_PLATFORM'] = 'unittest' +os.environ['NNI_TRIAL_JOB_ID'] = 'test_trial_job_id' +os.environ["NNI_OUTPUT_DIR"] = "./unittest" +os.environ["NNI_SYS_DIR"] = "./unittest" +os.environ["NNI_EXP_ID"] = "test_exp_id" +os.environ["MULTI_PHASE"] = "true" diff --git a/tools/nni_trial_tool/test/test_file_channel.py b/tools/nni_trial_tool/test/test_file_channel.py new file mode 100644 index 0000000000..e08bd7fca9 --- /dev/null +++ b/tools/nni_trial_tool/test/test_file_channel.py @@ -0,0 +1,123 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. 
+ +import json +import os +import random +import shutil +import string +import sys +import time +import unittest +from argparse import Namespace +from datetime import datetime + +from tools.nni_trial_tool.base_channel import CommandType +from tools.nni_trial_tool.file_channel import (FileChannel, command_path, + manager_commands_file_name) + +sys.path.append("..") + +runner_file_name = "commands/runner_commands.txt" +manager_file_name = "commands/manager_commands.txt" + + +class FileChannelTest(unittest.TestCase): + + def setUp(self): + self.args = Namespace() + self.args.node_count = 1 + self.args.node_id = None + if os.path.exists(command_path): + shutil.rmtree(command_path) + + def test_send(self): + fc = None + try: + fc = FileChannel(self.args) + fc.send(CommandType.ReportGpuInfo, "command1") + fc.send(CommandType.ReportGpuInfo, "command2") + + self.check_timeout(2, lambda: os.path.exists(runner_file_name)) + + self.assertTrue(os.path.exists(runner_file_name)) + with open(runner_file_name, "rb") as runner: + lines = runner.readlines() + self.assertListEqual(lines, [b'GI00000000000010"command1"\n', b'GI00000000000010"command2"\n']) + finally: + if fc is not None: + fc.close() + + def test_send_multi_node(self): + fc1 = None + fc2 = None + try: + runner1_file_name = "commands/runner_commands_1.txt" + self.args.node_id = 1 + fc1 = FileChannel(self.args) + fc1.send(CommandType.ReportGpuInfo, "command1") + # wait command have enough time to write before closed. 
+ + runner2_file_name = "commands/runner_commands_2.txt" + self.args.node_id = 2 + fc2 = FileChannel(self.args) + fc2.send(CommandType.ReportGpuInfo, "command1") + + self.check_timeout(2, lambda: os.path.exists(runner1_file_name) and os.path.exists(runner2_file_name)) + + self.assertTrue(os.path.exists(runner1_file_name)) + with open(runner1_file_name, "rb") as runner: + lines1 = runner.readlines() + self.assertTrue(os.path.exists(runner2_file_name)) + with open(runner2_file_name, "rb") as runner: + lines2 = runner.readlines() + self.assertListEqual(lines1, [b'GI00000000000010"command1"\n']) + self.assertListEqual(lines2, [b'GI00000000000010"command1"\n']) + finally: + if fc1 is not None: + fc1.close() + if fc2 is not None: + fc2.close() + + def test_receive(self): + fc = None + manager_file = None + try: + fc = FileChannel(self.args) + message = fc.receive() + self.assertEqual(message, (None, None)) + + os.mkdir(command_path) + manager_file = open(manager_file_name, "wb") + manager_file.write(b'TR00000000000009"manager"\n') + manager_file.flush() + + self.check_timeout(2, lambda: fc.received()) + message = fc.receive() + self.assertEqual(message, (CommandType.NewTrialJob, "manager")) + + manager_file.write(b'TR00000000000010"manager2"\n') + manager_file.flush() + + self.check_timeout(2, lambda: fc.received()) + message = fc.receive() + self.assertEqual(message, (CommandType.NewTrialJob, "manager2")) + finally: + if fc is not None: + fc.close() + if manager_file is not None: + manager_file.close() + + def check_timeout(self, timeout, callback): + interval = 0.01 + start = datetime.now().timestamp() + count = int(timeout / interval) + for x in range(count): + if callback(): + break + time.sleep(interval) + print("checked {} times, {:3F} seconds".format(x, datetime.now().timestamp()-start)) + + +if __name__ == '__main__': + unittest.main() From d880512cb3e724ac6a2d25f3e5b33251ac12e7da Mon Sep 17 00:00:00 2001 From: Chi Song Date: Fri, 19 Jun 2020 18:24:48 +0800 
Subject: [PATCH 31/98] refine code, and implement command channel --- src/nni_manager/core/commands.ts | 12 ++ .../training_service/common/gpuData.ts | 13 +- .../reusable/channels/fileCommandChannel.ts | 160 ++++++++++++++++++ .../reusable/commandChannel.ts | 96 +++++++++++ .../training_service/reusable/environment.ts | 4 +- .../openPaiEnvironmentService.ts | 16 +- .../reusable/routerTrainingService.ts | 12 +- .../reusable/storageService.ts | 22 +-- .../reusable/storageTrialService.ts | 116 ------------- .../{ => storages}/mountedStorageService.ts | 7 +- .../training_service/reusable/trial.ts | 17 +- .../reusable/trialDispatcher.ts | 103 +++++++++-- .../reusable/trials/storageTrialService.ts | 69 ++++++++ tools/nni_trial_tool/base_channel.py | 1 + tools/nni_trial_tool/file_channel.py | 6 +- tools/nni_trial_tool/gpu.py | 1 - tools/nni_trial_tool/trial.py | 18 +- tools/nni_trial_tool/trial_runner.py | 1 + 18 files changed, 500 insertions(+), 174 deletions(-) create mode 100644 src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts create mode 100644 src/nni_manager/training_service/reusable/commandChannel.ts rename src/nni_manager/training_service/reusable/{ => environments}/openPaiEnvironmentService.ts (97%) delete mode 100644 src/nni_manager/training_service/reusable/storageTrialService.ts rename src/nni_manager/training_service/reusable/{ => storages}/mountedStorageService.ts (97%) create mode 100644 src/nni_manager/training_service/reusable/trials/storageTrialService.ts diff --git a/src/nni_manager/core/commands.ts b/src/nni_manager/core/commands.ts index 7e626a6859..6e308808ec 100644 --- a/src/nni_manager/core/commands.ts +++ b/src/nni_manager/core/commands.ts @@ -12,12 +12,22 @@ const TRIAL_END = 'EN'; const TERMINATE = 'TE'; const PING = 'PI'; +const GPU_INFO = 'GI'; + const INITIALIZED = 'ID'; const NEW_TRIAL_JOB = 'TR'; const SEND_TRIAL_JOB_PARAMETER = 'SP'; const NO_MORE_TRIAL_JOBS = 'NO'; const KILL_TRIAL_JOB = 'KI'; +const 
TRIAL_COMMANDS: Set = new Set([ + NEW_TRIAL_JOB, + TRIAL_END, + SEND_TRIAL_JOB_PARAMETER, + GPU_INFO, + KILL_TRIAL_JOB, +]); + const TUNER_COMMANDS: Set = new Set([ INITIALIZE, REQUEST_TRIAL_JOBS, @@ -53,11 +63,13 @@ export { TRIAL_END, TERMINATE, PING, + GPU_INFO, INITIALIZED, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, KILL_TRIAL_JOB, TUNER_COMMANDS, ASSESSOR_COMMANDS, + TRIAL_COMMANDS, SEND_TRIAL_JOB_PARAMETER }; diff --git a/src/nni_manager/training_service/common/gpuData.ts b/src/nni_manager/training_service/common/gpuData.ts index caad31eaa9..655c885d59 100644 --- a/src/nni_manager/training_service/common/gpuData.ts +++ b/src/nni_manager/training_service/common/gpuData.ts @@ -16,12 +16,21 @@ export class GPUInfo { public gpuUtil: number; // the index number of this GPU (starting from 0) public readonly index: number; + public gpuMemTotal: number; + public gpuMemFree: number; + public gpuMemUsed: number; + public gpuType: string; - constructor(activeProcessNum: number, gpuMemUtil: number, gpuUtil: number, index: number) { + constructor(activeProcessNum: number, gpuMemUtil: number, gpuUtil: number, index: number, + gpuMemTotal: number, gpuMemFree: number, gpuMemUsed: number, gpuType: string) { this.activeProcessNum = activeProcessNum; this.gpuMemUtil = gpuMemUtil; this.gpuUtil = gpuUtil; this.index = index; + this.gpuMemTotal = gpuMemTotal; + this.gpuMemFree = gpuMemFree; + this.gpuMemUsed = gpuMemUsed; + this.gpuType = gpuType; } } @@ -44,7 +53,7 @@ export class GPUSummary { } export const GPU_INFO_COLLECTOR_FORMAT_WINDOWS: string = -` + ` $env:METRIC_OUTPUT_DIR="{0}" $app = Start-Process "python" -ArgumentList "-m nni_gpu_tool.gpu_metrics_collector" -passthru -NoNewWindow Write $app.ID | Out-File {1} -NoNewline -encoding utf8 diff --git a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts new file mode 100644 index 0000000000..29ac2171a7 --- /dev/null +++ 
b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts @@ -0,0 +1,160 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as component from "../../../common/component"; +import { delay } from "../../../common/utils"; +import { CommandChannel } from "../commandChannel"; +import { EnvironmentInformation } from "../environment"; +import { StorageService } from "../storageService"; + +class FileHandler { + public fileName: string; + public offset: number = 0; + + constructor(fileName: string) { + this.fileName = fileName; + } +} + + +class EnvironmentHandler { + public environment: EnvironmentInformation; + public handlers: Map = new Map(); + + constructor(environment: EnvironmentInformation) { + this.environment = environment; + } +} + +export class FileCommandChannel extends CommandChannel { + private readonly commandPath = "commands"; + private stopping: boolean = false; + // each node have a receiver + private receive_handlers: Map = new Map(); + // make sure no concurrent issue when sending commands. + private send_queues: [EnvironmentInformation, string][] = []; + + public start(): void { + // start command loops + this.receiveLoop(); + this.sendLoop(); + } + + public stop(): void { + this.stopping = true; + } + + public async open(environment: EnvironmentInformation): Promise { + if (this.receive_handlers.has(environment.id)) { + throw new Error(`FileCommandChannel: env ${environment.id} is opened already, shouldn't be opened again.`); + } + this.receive_handlers.set(environment.id, new EnvironmentHandler(environment)); + } + + public async close(environment: EnvironmentInformation): Promise { + if (this.receive_handlers.has(environment.id)) { + this.receive_handlers.delete(environment.id); + } + } + + protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise { + this.send_queues.push([environment, message]); + } + + private async sendLoop(): Promise { + const intervalSeconds = 0.5; + while (!this.stopping) { + const start = new Date(); + + if (this.send_queues.length > 0) { + const storageService = 
component.get(StorageService); + + while (this.send_queues.length > 0) { + const item = this.send_queues.shift(); + if (item === undefined) { + break; + } + const environment = item[0]; + const message = `${item[1]}\n`; + + const fileName = storageService.joinPath(environment.workingFolder, this.commandPath, `manager_commands.txt`); + await storageService.save(message, fileName, true); + } + } + + const end = new Date(); + const delayMs = intervalSeconds * 1000 - (end.valueOf() - start.valueOf()); + if (delayMs > 0) { + await delay(delayMs); + } + } + } + + private async receiveLoop(): Promise { + const intervalSeconds = 2; + const storageService = component.get(StorageService); + + while (!this.stopping) { + const start = new Date(); + + const envs = [...this.receive_handlers.values()]; + for (const environmentHandler of envs) { + const envCommandFolder = storageService.joinPath(environmentHandler.environment.workingFolder, this.commandPath); + // open new command files + if (environmentHandler.handlers.size < environmentHandler.environment.nodeCount) { + // to find all node commands file + const commandFileNames = await storageService.listDirectory(envCommandFolder); + const toAddedFileNames = []; + for (const commandFileName of commandFileNames) { + if (commandFileName.startsWith("runner_commands") && !environmentHandler.handlers.has(commandFileName)) { + toAddedFileNames.push(commandFileName); + } + } + + for (const toAddedFileName of toAddedFileNames) { + const fullPath = storageService.joinPath(envCommandFolder, toAddedFileName); + const fileHandler: FileHandler = new FileHandler(fullPath); + environmentHandler.handlers.set(toAddedFileName, fileHandler); + this.log.debug(`FileCommandChannel: added fileHandler env ${environmentHandler.environment.id} ${toAddedFileName}`); + } + } + + // to loop all commands + for (const fileHandler of environmentHandler.handlers.values()) { + const newContent = await storageService.readFileContent(fileHandler.fileName, 
fileHandler.offset, undefined); + if (newContent.length > 0) { + const commands = newContent.split('\n'); + for (const command of commands) { + this.handleCommand(environmentHandler.environment, command); + } + fileHandler.offset += newContent.length; + } + } + } + + const end = new Date(); + const delayMs = intervalSeconds * 1000 - (end.valueOf() - start.valueOf()); + if (delayMs > 0) { + await delay(delayMs); + } + } + } +} diff --git a/src/nni_manager/training_service/reusable/commandChannel.ts b/src/nni_manager/training_service/reusable/commandChannel.ts new file mode 100644 index 0000000000..29da8781b8 --- /dev/null +++ b/src/nni_manager/training_service/reusable/commandChannel.ts @@ -0,0 +1,96 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import { EventEmitter } from "events"; +import { TRIAL_COMMANDS } from "../../core/commands"; +import { encodeCommand } from "../../core/ipcInterface"; +import { EnvironmentInformation } from "./environment"; +import { Logger, getLogger } from "../../common/log"; + +const acceptedCommands: Set = new Set(TRIAL_COMMANDS); + +export enum ChannelType { + API = "api", + Storage = "storage", +} + +export class Command { + public readonly environment: EnvironmentInformation; + public readonly command: string; + public readonly data: any; + + constructor(environment: EnvironmentInformation, command: string, data: any) { + if (!acceptedCommands.has(command)) { + throw new Error(`unaccepted command ${command}`); + } + this.environment = environment; + this.command = command; + this.data = data; + } +} + +export abstract class CommandChannel { + protected readonly log: Logger; + + protected readonly commandEmitter: EventEmitter; + private readonly commandPattern: RegExp = /(?[\w]{2})(?[\d]{14})(?.*)\n?/gm; + + public constructor(commandEmitter: EventEmitter) { + this.log = getLogger(); + this.commandEmitter = commandEmitter; + } + + public abstract start(): void; + public abstract stop(): void; + + public abstract open(environment: EnvironmentInformation): Promise; + public abstract close(environment: EnvironmentInformation): Promise; + + protected abstract sendCommandInternal(environment: EnvironmentInformation, message: string): Promise; + + public async sendCommand(environment: EnvironmentInformation, commantType: string, data: any): Promise { + const command = encodeCommand(commantType, JSON.stringify(data)); + await this.sendCommandInternal(environment, command.toString("utf8")); + this.log.debug(`CommandChannel: env ${environment.id} sent command: ${command}`); + } + + protected handleCommand(environment: EnvironmentInformation, content: string): void { + let matches = this.commandPattern.exec(content); + + while (matches) { + if (undefined !== 
matches.groups) { + const commandType = matches.groups["type"]; + const dataLength = parseInt(matches.groups["length"]); + let data: any = matches.groups["data"]; + if (dataLength !== data.length) { + throw new Error(`dataLength ${dataLength} not equal to actual length ${data.length}: ${data}`); + } + // to handle encode('utf8') of Python + data = JSON.parse('"' + data.split('"').join('\\"') + '"'); + const finalData = JSON.parse(data); + const command = new Command(environment, commandType, finalData); + this.commandEmitter.emit("command", command); + this.log.debug(`CommandChannel: env ${environment.id} emit command: ${commandType}, ${data}`); + } + matches = this.commandPattern.exec(content); + } + } +} diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 0f9b42c1c6..e9ca1f6be8 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -46,7 +46,7 @@ export class RunnerSettings { // specify which communication channel is used by runner. 
// supported channel includes: api, storage, aml - public commandChannel: string = "api"; + public commandChannel: string = "file"; } export class EnvironmentInformation { @@ -70,7 +70,7 @@ export class EnvironmentInformation { public command: string = ""; public nodeCount: number = 1; - public gpuSummary: GPUSummary | undefined; + public gpuSummary: Map = new Map(); constructor(id: string, jobName: string, jobId?: string) { this.id = id; diff --git a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts similarity index 97% rename from src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts rename to src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 174f77a717..6895cc3632 100644 --- a/src/nni_manager/training_service/reusable/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -22,14 +22,14 @@ import * as fs from 'fs'; import * as request from 'request'; import { Deferred } from 'ts-deferred'; -import * as component from '../../common/component'; -import { getExperimentId } from '../../common/experimentStartupInfo'; -import { getLogger, Logger } from '../../common/log'; -import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { PAIClusterConfig } from '../pai/paiConfig'; -import { NNIPAIK8STrialConfig } from '../pai/paiK8S/paiK8SConfig'; -import { EnvironmentInformation, EnvironmentService } from './environment'; -import { StorageService } from './storageService'; +import * as component from '../../../common/component'; +import { getExperimentId } from '../../../common/experimentStartupInfo'; +import { getLogger, Logger } from '../../../common/log'; +import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; +import { PAIClusterConfig } from '../../pai/paiConfig'; +import { NNIPAIK8STrialConfig 
} from '../../pai/paiK8S/paiK8SConfig'; +import { EnvironmentInformation, EnvironmentService } from '../environment'; +import { StorageService } from '../storageService'; const yaml = require('js-yaml'); diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 8ce766224d..afbcc4ff9f 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -29,11 +29,13 @@ import { PAIK8STrainingService } from '../pai/paiK8S/paiK8STrainingService'; import { TrialDispatcher } from './trialDispatcher'; import { Container, Scope } from 'typescript-ioc'; import { EnvironmentService } from './environment'; -import { OpenPaiEnvironmentService } from './openPaiEnvironmentService'; +import { OpenPaiEnvironmentService } from './environments/openPaiEnvironmentService'; import { StorageService } from './storageService'; -import { MountedStorageService } from './mountedStorageService'; +import { MountedStorageService } from './storages/mountedStorageService'; import { TrialService } from './trial'; -import { StorageTrialService } from './storageTrialService'; +import { StorageTrialService } from './trials/storageTrialService'; +import { CommandChannel } from './commandChannel'; +import { FileCommandChannel } from './channels/fileCommandChannel'; /** @@ -126,6 +128,10 @@ class RouterTrainingService implements TrainingService { Container.bind(TrialService) .to(StorageTrialService) .scope(Scope.Singleton); + + Container.bind(CommandChannel) + .to(FileCommandChannel) + .scope(Scope.Singleton); } else { this.log.debug(`caching metadata key:{} value:{}, as training service is not determined.`); this.internalTrainingService = component.get(PAIK8STrainingService); diff --git a/src/nni_manager/training_service/reusable/storageService.ts b/src/nni_manager/training_service/reusable/storageService.ts index 
1d08643fcf..2287e37b9f 100644 --- a/src/nni_manager/training_service/reusable/storageService.ts +++ b/src/nni_manager/training_service/reusable/storageService.ts @@ -19,11 +19,11 @@ 'use strict'; -import { uniqueString } from '../../common/utils'; import * as fs from 'fs'; import * as os from 'os'; import * as path from 'path'; -import { Logger, getLogger } from '../../common/log'; +import { getLogger, Logger } from '../../common/log'; +import { uniqueString } from '../../common/utils'; import { tarAdd } from '../common/util'; export abstract class StorageService { @@ -129,21 +129,23 @@ export abstract class StorageService { public async save(content: string, remotePath: string, isAttach: boolean = false): Promise { remotePath = this.expandPath(true, remotePath); - this.logger.debug(`saving content to remotePath: ${remotePath}, length: ${content.length}`); - const fileName = this.internalBasename(remotePath); - const tempFileName = `temp_${uniqueString(4)}_${fileName}`; - - const localTempFileName = path.join(os.tmpdir(), tempFileName); - + this.logger.debug(`saving content to remotePath: ${remotePath}, length: ${content.length}, isAttach: ${isAttach}`); const remoteDir = this.internalDirname(remotePath); - const remoteTempFile = this.internalJoin(remoteDir, tempFileName); if (isAttach) { + if (await this.internalExists(remoteDir) === false) { + await this.internalMkdir(remoteDir); + } const result = await this.internalAttach(remotePath, content); - if (false === result){ + if (false === result) { throw new Error("this.internalAttach doesn't support"); } } else { + const fileName = this.internalBasename(remotePath); + const tempFileName = `temp_${uniqueString(4)}_${fileName}`; + const localTempFileName = path.join(os.tmpdir(), tempFileName); + const remoteTempFile = this.internalJoin(remoteDir, tempFileName); + if (await this.internalExists(remotePath) === true) { await this.internalRemove(remotePath, false, false); } diff --git 
a/src/nni_manager/training_service/reusable/storageTrialService.ts b/src/nni_manager/training_service/reusable/storageTrialService.ts deleted file mode 100644 index 16c53c2e98..0000000000 --- a/src/nni_manager/training_service/reusable/storageTrialService.ts +++ /dev/null @@ -1,116 +0,0 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -'use strict'; - -import * as component from "../../common/component"; -import { delay, generateParamFileName } from "../../common/utils"; -import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../core/commands'; -import { encodeCommand } from "../../core/ipcInterface"; -import { EnvironmentInformation } from "./environment"; -import { StorageService } from "./storageService"; -import { TrialDetail, TrialService } from "./trial"; -import { TrialJobApplicationForm } from "../../common/trainingService"; - -@component.Singleton -export class StorageTrialService extends TrialService { - public async config(_key: string, _value: string): Promise { - return; - } - - public async refreshTrialsStatus(trials: TrialDetail[]): Promise { - const storageService = component.get(StorageService); - - for (const trial of trials) { - const currentStatus = trial.status; - // to prevent inconsistent status, skip all non running trials - if (currentStatus !== "RUNNING") { - continue; - } - - const environment = trial.environment; - if (environment === undefined) { - this.log.error(`found running trial ${trial.id} has no environment, set trial to UNKNOWN.`); - trial.status = "UNKNOWN"; - continue; - } - - let remoteFiles: string[] = []; - const codeFilePath = storageService.joinPath(trial.workingDirectory, trial.TRIAL_METADATA_DIR); - remoteFiles = await storageService.listDirectory(codeFilePath); - - if (remoteFiles.length > 0) { - let latestTimestamp = 0; - - trial.nodeExitResults = []; - for (const fileName of remoteFiles) { - if (fileName.startsWith("code")) { - const fullName = storageService.joinPath(codeFilePath, fileName) - const fileContent = await storageService.readFileContent(fullName); - - const match: RegExpMatchArray | null = fileContent.trim().match(/^-?(\d+)\s+(\d+)$/); - if (match !== null) { - const { 1: code, 2: timestamp } = match; - const intCode = parseInt(code, 10) - latestTimestamp = Math.max(latestTimestamp, parseInt(timestamp, 10)); - if (intCode === 0) { - 
trial.nodeExitResults.push("SUCCEEDED"); - } else { - trial.nodeExitResults.push("FAILED"); - } - } - } - } - } - } - } - - public async startTrial(trial: TrialDetail): Promise { - if (trial.environment === undefined) { - throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); - } - await this.sendCommand(NEW_TRIAL_JOB, trial.settings, trial.environment); - } - - public async stopTrial(trial: TrialDetail): Promise { - if (trial.environment === undefined) { - throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); - } - await this.sendCommand(KILL_TRIAL_JOB, trial.id, trial.environment); - } - - public async updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise { - const storageService = component.get(StorageService); - const fileName = storageService.joinPath(trial.workingDirectory, generateParamFileName(form.hyperParameters)) - - // Write file content ( parameter.cfg ) to working folders - await storageService.save(form.hyperParameters.value, fileName); - } - - private async sendCommand(commantType: string, data: any, environment: EnvironmentInformation): Promise { - let retryCount = 10; - let findingName: boolean = true; - const command = encodeCommand(commantType, JSON.stringify(data)); - const storageService = component.get(StorageService); - const fileName = storageService.joinPath(environment.workingFolder, `commands`, `manager_commands.txt`); - - // prevent to have imcomplete command, so save as temp name and then rename. 
- await storageService.save(command.toString("utf8"), fileName, true); - } -} diff --git a/src/nni_manager/training_service/reusable/mountedStorageService.ts b/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts similarity index 97% rename from src/nni_manager/training_service/reusable/mountedStorageService.ts rename to src/nni_manager/training_service/reusable/storages/mountedStorageService.ts index 27c3cb7034..6b0597e0a7 100644 --- a/src/nni_manager/training_service/reusable/mountedStorageService.ts +++ b/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts @@ -20,7 +20,7 @@ import * as fs from 'fs'; import * as path from 'path'; import { Deferred } from "ts-deferred"; -import { StorageService } from "./storageService"; +import { StorageService } from "../storageService"; export class MountedStorageService extends StorageService { @@ -139,7 +139,10 @@ export class MountedStorageService extends StorageService { } protected async internalAttach(remotePath: string, content: string): Promise { - await fs.promises.appendFile(remotePath, content + "\n"); + await fs.promises.appendFile(remotePath, content, { + encoding: "utf8", + flag: "a", + }); return true; } diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts index 4f4a70fe25..d9bccee59e 100644 --- a/src/nni_manager/training_service/reusable/trial.ts +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -27,8 +27,7 @@ import { GPUInfo } from "training_service/common/gpuData"; export abstract class TrialService { protected readonly log: Logger; - public abstract config(key: string, value: string): Promise; - public abstract refreshTrialsStatus(trials: TrialDetail[]): Promise; + public abstract config(key: string, value: any): Promise; public abstract updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise; public abstract startTrial(trial: TrialDetail): Promise; public abstract 
stopTrial(trial: TrialDetail): Promise; @@ -54,7 +53,7 @@ export class TrialDetail implements TrialJobDetail { // init settings of trial public settings = {}; // it's used to aggregate node status for multiple node trial - public nodeExitResults: TrialJobStatus[]; + public nodes: Map; // assigned GPUs for multi-trial scheduled. public assignedGpus: GPUInfo[] = []; @@ -68,6 +67,16 @@ export class TrialDetail implements TrialJobDetail { this.workingDirectory = workingDirectory; this.form = form; this.tags = []; - this.nodeExitResults = []; + this.nodes = new Map(); + } +} + +export class NodeInfomation { + public id: string; + public status: TrialJobStatus = "UNKNOWN"; + public endTime?: number; + + constructor(id: string) { + this.id = id; } } diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 2eac82a12a..96f461a08c 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -27,14 +27,18 @@ import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; import { delay, getLogLevel, getVersion, uniqueString } from '../../common/utils'; +import { GPU_INFO, TRIAL_END } from '../../core/commands'; +import { GPUSummary } from '../../training_service/common/gpuData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { validateCodeDir } from '../common/util'; +import { FileCommandChannel } from './channels/fileCommandChannel'; +import { Command, CommandChannel } from './commandChannel'; import { EnvironmentInformation, 
EnvironmentService, RunnerSettings } from './environment'; import { JobRestServer } from './jobRestServer'; import { StorageService } from './storageService'; -import { TrialDetail, TrialService } from './trial'; +import { NodeInfomation, TrialDetail, TrialService } from './trial'; /** * It uses to manage jobs on training platforms @@ -55,8 +59,11 @@ class TrialDispatcher implements TrainingService { private trialConfig: TrialConfig | undefined; private runnerSettings: RunnerSettings; + private commandEmitter: EventEmitter; + private readonly trials: Map; private readonly environments: Map; + private readonly commandChannel: CommandChannel; constructor() { this.log = getLogger(); @@ -69,8 +76,10 @@ class TrialDispatcher implements TrainingService { this.runnerSettings.experimentId = this.experimentId; this.runnerSettings.platform = getPlatform(); - const logLevel = getLogLevel(); + this.commandEmitter = new EventEmitter(); + this.commandChannel = new FileCommandChannel(this.commandEmitter); + const logLevel = getLogLevel(); this.log.debug(`current folder ${__dirname}`); // different source folder in Linux and Windows if (logLevel == "debug" && (fs.existsSync("../../../src/nni_manager") || __dirname.endsWith("src\\nni_manager\\dist\\training_service\\reusable"))) { @@ -154,16 +163,28 @@ class TrialDispatcher implements TrainingService { await this.jobRestServer.start(); this.jobRestServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`Environment Manager rest server listening on: ${this.jobRestServer.endPoint}`); + this.log.info(`TrialDispatcher: rest server listening on: ${this.jobRestServer.endPoint}`); this.runnerSettings.nniManagerPort = this.jobRestServer.clusterRestServerPort; + // start channel + this.commandEmitter.on("command", (command: Command): void => { + this.handleCommand(command).catch((err: Error) => { + this.log.error(`TrialDispatcher: error on handle env ${command.environment.id} command: ${command.command}, data: ${command.data}, 
error: ${err}`); + }) + }); + this.commandChannel.start(); + this.log.info(`TrialDispatcher: started channel ${typeof (this.commandChannel)}`); + if (this.trialConfig === undefined) { throw new Error(`trial config shouldn't be undefined in run()`); } + const trialService = component.get(TrialService); + trialService.config("channel", this.commandChannel); + const environmentService = component.get(EnvironmentService); if (environmentService.hasStorageService) { - this.log.info(`Environment Manager copying code and settings.`); + this.log.info(`TrialDispatcher: copying code and settings.`); const storageService = component.get(StorageService); // Copy the compressed file to remoteDirectory and delete it const codeDir = path.resolve(this.trialConfig.codeDir); @@ -186,7 +207,7 @@ class TrialDispatcher implements TrainingService { } } - this.log.info(`Environment Manager run loop started.`); + this.log.info(`TrialDispatcher: run loop started.`); await Promise.all([ this.environmentMaintenanceLoop(), this.trialManagementLoop(), @@ -238,11 +259,13 @@ class TrialDispatcher implements TrainingService { this.stopping = true; const environmentService = component.get(EnvironmentService); const environments = [...this.environments.values()]; + for (let index = 0; index < environments.length; index++) { const environment = environments[index]; if (environment.isAlive === true) { this.log.info(`stopping environment ${environment.id}...`); await environmentService.stopEnvironment(environment); + await this.commandChannel.close(environment); this.log.info(`stopped environment ${environment.id}.`); } } @@ -253,17 +276,22 @@ class TrialDispatcher implements TrainingService { } catch (error) { this.log.error(`Rest server stopped failed, error: ${error.message}`); } + + this.commandEmitter.off("command", this.handleCommand); + this.commandChannel.stop(); } private async environmentMaintenanceLoop(): Promise { const environmentService = component.get(EnvironmentService); while 
(!this.stopping) { const environments: EnvironmentInformation[] = []; - this.environments.forEach((environment) => { + for (const environment of this.environments.values()) { if (environment.isAlive === true) { environments.push(environment); + } else { + await this.commandChannel.close(environment); } - }); + } await environmentService.refreshEnvironmentsStatus(environments); environments.forEach((environment) => { @@ -301,9 +329,6 @@ class TrialDispatcher implements TrainingService { continue; } - const trialService = component.get(TrialService); - trialService.refreshTrialsStatus(toRefreshedTrials); - const waitingTrials: TrialDetail[] = []; let liveTrialsCount = 0; for (const trial of toRefreshedTrials) { @@ -322,9 +347,10 @@ class TrialDispatcher implements TrainingService { const environmentStatus = environment.status; // any node exit, then make sure the whole trial stopped. - if (trial.nodeExitResults.length > 0) { - const completedCount = trial.nodeExitResults.length; + if (trial.nodes.size > 0) { + const completedCount = trial.nodes.size; let finalStatus: TrialJobStatus = "SUCCEEDED"; + let lastTimestamp: number | undefined; this.log.debug(`found ${completedCount} completed trial node(s), nodeCount: ${environment.nodeCount}`); // if some trial processes doesn't exit, kill it for next one. 
@@ -334,12 +360,22 @@ class TrialDispatcher implements TrainingService { const trialService = component.get(TrialService); await trialService.stopTrial(trial); } - for (const nodeStatus of trial.nodeExitResults) { - if (nodeStatus == "FAILED") { + for (const node of trial.nodes.values()) { + if (node.status === "FAILED") { finalStatus = "FAILED"; } + if (node.endTime !== undefined) { + if (lastTimestamp === undefined) { + lastTimestamp = node.endTime + } else { + lastTimestamp = Math.max(node.endTime, lastTimestamp); + } + } } trial.status = finalStatus; + if (lastTimestamp === undefined) { + trial.endTime = lastTimestamp; + } this.releaseEnvironment(trial); } else if (environmentStatus !== "RUNNING") { this.log.error(`found running trial ${trial.id} on '${environment.jobId}' with '${environmentStatus}', set trial to environment status.`); @@ -417,6 +453,8 @@ class TrialDispatcher implements TrainingService { environment.isIdle = true; environment.isAlive = true; } + + await this.commandChannel.open(environment); this.log.info(`requested environment ${environment.id} and job id is ${environment.jobId}.`); } @@ -452,6 +490,43 @@ class TrialDispatcher implements TrainingService { trial.environment.isIdle = true; trial.environment = undefined; } + + private async handleCommand(command: Command): Promise { + this.log.debug(`TrialDispatcher: env ${command.environment.id} received command ${command.command}, data: ${command.data}`); + const environment = command.environment; + const data = command.data; + const nodeId = data["node"]; + switch (command.command) { + case GPU_INFO: + environment.gpuSummary.set(nodeId, (data)); + break + case TRIAL_END: + { + const trialId = data["trial"]; + const trial = await this.getTrialJob(trialId); + const code = parseInt(data["code"]); + const timestamp = parseInt(data["time"]); + let exitStatus: TrialJobStatus = "SUCCEEDED"; + if (code !== 0) { + exitStatus = "FAILED"; + } + + let node: NodeInfomation | undefined; + if 
(trial.nodes.has(nodeId)) { + node = trial.nodes.get(nodeId); + } else { + node = new NodeInfomation(nodeId); + trial.nodes.set(nodeId, node); + } + if (undefined === node) { + throw new Error("node is impossible to be undefined (see above code), but make eslint happy!"); + } + node.status = exitStatus; + node.endTime = timestamp; + } + break + } + } } export { TrialDispatcher }; diff --git a/src/nni_manager/training_service/reusable/trials/storageTrialService.ts b/src/nni_manager/training_service/reusable/trials/storageTrialService.ts new file mode 100644 index 0000000000..fd5dfdc30f --- /dev/null +++ b/src/nni_manager/training_service/reusable/trials/storageTrialService.ts @@ -0,0 +1,69 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as component from "../../../common/component"; +import { TrialJobApplicationForm } from "../../../common/trainingService"; +import { generateParamFileName } from "../../../common/utils"; +import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../../core/commands'; +import { CommandChannel } from "../commandChannel"; +import { StorageService } from "../storageService"; +import { TrialDetail, TrialService } from "../trial"; + +@component.Singleton +export class StorageTrialService extends TrialService { + private commandChannel: CommandChannel | undefined; + + public async config(key: string, value: any): Promise { + switch (key) { + case "channel": + this.commandChannel = value; + break; + } + } + + public async startTrial(trial: TrialDetail): Promise { + if (trial.environment === undefined) { + throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); + } + if (this.commandChannel === undefined) { + throw new Error(`trialService: commandChannel shouldn't be undefined!`); + } + await this.commandChannel.sendCommand(trial.environment, NEW_TRIAL_JOB, trial.settings); + } + + public async stopTrial(trial: TrialDetail): Promise { + if (trial.environment === undefined) { + throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); + } + if (this.commandChannel === undefined) { + throw new Error(`trialService: commandChannel shouldn't be undefined!`); + } + await this.commandChannel.sendCommand(trial.environment, KILL_TRIAL_JOB, trial.id); + } + + public async updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise { + const storageService = component.get(StorageService); + const fileName = storageService.joinPath(trial.workingDirectory, generateParamFileName(form.hyperParameters)) + + // Write file content ( parameter.cfg ) to working folders + await storageService.save(form.hyperParameters.value, fileName); + } +} diff --git 
a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py index 9d566ef996..829e12b89a 100644 --- a/tools/nni_trial_tool/base_channel.py +++ b/tools/nni_trial_tool/base_channel.py @@ -86,6 +86,7 @@ def send(self, command, data): data: string payload. the message is sent synchronized. """ + data["node"] = self.args.node_id data = json.dumps(data) data = data.encode('utf8') message = b'%b%014d%b' % (command.value, len(data), data) diff --git a/tools/nni_trial_tool/file_channel.py b/tools/nni_trial_tool/file_channel.py index 3d728813a0..d0b6478000 100644 --- a/tools/nni_trial_tool/file_channel.py +++ b/tools/nni_trial_tool/file_channel.py @@ -50,13 +50,13 @@ def _inner_send(self, message): self.out_file.flush() def _open_manager_command(self): - manager_command_file_name = os.path.join(command_path, manager_commands_file_name) + full_name = os.path.join(command_path, manager_commands_file_name) if self.in_file is not None and self.in_file.closed: self.in_file = None - if self.in_file is None and os.path.exists(manager_command_file_name): - self.in_file = open(manager_command_file_name, "rb") + if self.in_file is None and os.path.exists(full_name): + self.in_file = open(full_name, "rb") self.in_file.seek(self.in_offset) def _inner_receive(self): diff --git a/tools/nni_trial_tool/gpu.py b/tools/nni_trial_tool/gpu.py index b7e87d7fb6..47cd70bce3 100644 --- a/tools/nni_trial_tool/gpu.py +++ b/tools/nni_trial_tool/gpu.py @@ -16,7 +16,6 @@ def collect_gpu_usage(node_id): except Exception: traceback.print_exc() info = gen_empty_gpu_metric() - info["node"] = node_id return info diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index b3dc0622be..63553a4ae4 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -12,6 +12,7 @@ import psutil from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log +from .base_channel import CommandType trial_output_path_name = ".nni" @@ -21,6 +22,7 @@ def 
__init__(self, args, data): self.process = None self.data = data self.args = args + self.command_channel = args.command_channel self.trial_syslogger_stdout = None global NNI_TRIAL_JOB_ID @@ -101,15 +103,13 @@ def is_running(self): retCode = ctypes.c_long(retCode).value nni_log(LogType.Info, '{0}: subprocess terminated. Exit code is {1}.'.format(self.name, retCode)) - # Exit as the retCode of subprocess(trial) - exit_code_file_name = os.path.join(self.trial_output_dir, "code") - if (self.node_id is not None): - while True: - exit_code_file_name = "%s_%s" % (exit_code_file_name, self.node_id) - if not os.path.exists(exit_code_file_name): - break - with open(exit_code_file_name, "w") as exit_file: - exit_file.write("%s %s" % (retCode, int(datetime.now().timestamp() * 1000))) + end_time = int(datetime.now().timestamp() * 1000) + end_message = { + "code": retCode, + "time": end_time, + "trial": self.id, + } + self.command_channel.send(CommandType.TrialEnd, end_message) self.cleanup() return False else: diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index e4bcb592f4..08de265be9 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -34,6 +34,7 @@ def main_loop(args): else: command_channel = FileChannel(args) nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel))) + args.command_channel = command_channel trial = None From 5c33d111a2a4b2d483bce52ba7bb5d23131fb511 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Sat, 20 Jun 2020 17:14:40 +0800 Subject: [PATCH 32/98] update --- .../config/aml/checkEnvieonment.py | 27 ++++++ ...{jobSubmission.py => createEnvironment.py} | 6 +- .../reusable/amlEnvironmentService.ts | 88 ++++++++++++++++++- .../training_service/reusable/environment.ts | 2 + .../reusable/trialDispatcher.ts | 24 ++++- 5 files changed, 137 insertions(+), 10 deletions(-) create mode 100644 
src/nni_manager/config/aml/checkEnvieonment.py rename src/nni_manager/config/aml/{jobSubmission.py => createEnvironment.py} (91%) diff --git a/src/nni_manager/config/aml/checkEnvieonment.py b/src/nni_manager/config/aml/checkEnvieonment.py new file mode 100644 index 0000000000..84d726a91b --- /dev/null +++ b/src/nni_manager/config/aml/checkEnvieonment.py @@ -0,0 +1,27 @@ +import os +import time +from argparse import ArgumentParser +from azureml.core import Experiment, RunConfiguration, ScriptRunConfig +from azureml.core.compute import ComputeTarget +from azureml.core.run import RUNNING_STATES, RunStatus, Run +from azureml.core import Workspace +from azureml.core.conda_dependencies import CondaDependencies + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument('--subscription_id', help='the subscription id of aml') + parser.add_argument('--resource_group', help='the resource group of aml') + parser.add_argument('--workspace_name', help='the workspace name of aml') + parser.add_argument('--experiment_name', help='the experiment name') + parser.add_argument('--environment_id', help='the experiment id') + args = parser.parse_args() + + ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) + experiment = Experiment(ws, args.experiment_name) + + run_list = experiment.get_runs() + for run in run_list: + if run['runId'] == args.environment_id: + print(run['status']) + return + print('Unknown') diff --git a/src/nni_manager/config/aml/jobSubmission.py b/src/nni_manager/config/aml/createEnvironment.py similarity index 91% rename from src/nni_manager/config/aml/jobSubmission.py rename to src/nni_manager/config/aml/createEnvironment.py index cd6fbaa6a9..e3f50c1bad 100644 --- a/src/nni_manager/config/aml/jobSubmission.py +++ b/src/nni_manager/config/aml/createEnvironment.py @@ -34,8 +34,4 @@ run_config.node_count = 1 config = ScriptRunConfig(source_directory=args.code_dir, script=args.script, run_config=run_config) script_run = 
experiment.submit(config) - print(script_run.get_portal_url()) - while True: - time.sleep(5) - print(script_run.get_status()) - print(script_run.get_metrics()) + print(script_run.get_details()["runId"]) \ No newline at end of file diff --git a/src/nni_manager/training_service/reusable/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/amlEnvironmentService.ts index 22c7dd56c1..d945ef4856 100644 --- a/src/nni_manager/training_service/reusable/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/amlEnvironmentService.ts @@ -21,6 +21,7 @@ import * as fs from 'fs'; import * as request from 'request'; +import * as path from 'path'; import { Deferred } from 'ts-deferred'; import * as component from '../../common/component'; import { getExperimentId } from '../../common/experimentStartupInfo'; @@ -29,14 +30,15 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { AMLClusterConfig, AMLTrialConfig, AMLTrialJobDetail } from '../aml/amlConfig'; import { EnvironmentInformation, EnvironmentService } from './environment'; import { StorageService } from './storageService'; +import { PythonShell } from 'python-shell'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; import { execMkdir, validateCodeDir, execCopydir } from '../common/util'; import { - generateParamFileName, - getIPV4Address, getVersion, uniqueString + delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, + getVersion, uniqueString } from '../../common/utils'; const yaml = require('js-yaml'); @@ -57,13 +59,15 @@ export class AMLEnvironmentService implements EnvironmentService { private nniVersion?: string; private experimentId: string; private nniManagerIpConfig?: NNIManagerIpConfig; + private experimentRootDir: string; constructor() { this.experimentId = getExperimentId(); + this.experimentRootDir = getExperimentRootDir(); 
} public get hasStorageService(): boolean { - return true; + return false; } public async config(key: string, value: string): Promise { @@ -100,11 +104,89 @@ export class AMLEnvironmentService implements EnvironmentService { } public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { + environments.forEach((environment) => { + + if (this.amlClusterConfig === undefined) { + throw new Error('AML Cluster config is not initialized'); + } + if (this.amlTrialConfig === undefined) { + throw new Error('AML trial config is not initialized'); + } + + let pyshell = new PythonShell('createEnvironment.py', { + scriptPath: './config/aml', + pythonOptions: ['-u'], // get print results in real-time + args: [ + '--subscription_id', this.amlClusterConfig.subscriptionId, + '--resource_group', this.amlClusterConfig.resourceGroup, + '--workspace_name', this.amlClusterConfig.workspaceName, + '--computer_target', this.amlTrialConfig.computerTarget, + '--experiment_name', `nni_exp_${this.experimentId}`, + '--code_dir', environment.environmentLocalTempFolder, + '--script', 'nni_script.py' + ] + }); + pyshell.on('message', function (status: any) { + // received a message sent from the Python script (a simple "print" statement) + console.log(`update status ${status}`); + switch (status.toUpperCase()) { + case 'QUEUED': + environment.status = 'WAITING'; + break; + case 'WAITING': + case 'RUNNING': + case 'SUCCEEDED': + case 'FAILED': + environment.status = status; + break; + case 'STOPPED': + case 'STOPPING': + environment.status = 'USER_CANCELED'; + break; + default: + environment.status = 'UNKNOWN'; + } + }); + }); + return; } public async startEnvironment(environment: EnvironmentInformation): Promise { + const deferred: Deferred = new Deferred(); + + if (this.amlClusterConfig === undefined) { + throw new Error('AML Cluster config is not initialized'); + } + if (this.amlTrialConfig === undefined) { + throw new Error('AML trial config is not initialized'); + } 
+ //TODO: use temp folder + //let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId); + await fs.promises.writeFile(path.join(environment.environmentLocalTempFolder, 'nni_script.py'), environment.command ,{ encoding: 'utf8' }); + + let pyshell = new PythonShell('createEnvironment.py', { + scriptPath: './config/aml', + pythonOptions: ['-u'], // get print results in real-time + args: [ + '--subscription_id', this.amlClusterConfig.subscriptionId, + '--resource_group', this.amlClusterConfig.resourceGroup, + '--workspace_name', this.amlClusterConfig.workspaceName, + '--computer_target', this.amlTrialConfig.computerTarget, + '--docker_image', this.amlTrialConfig.image, + '--experiment_name', `nni_exp_${this.experimentId}`, + '--code_dir', environment.environmentLocalTempFolder, + '--script', 'nni_script.py' + ] + }); + pyshell.on('message', function (envId: any) { + // received a message sent from the Python script (a simple "print" statement) + console.log(envId); + environment.id = envId; + }); + return deferred.resolve(); } public async stopEnvironment(environment: EnvironmentInformation): Promise { + } } diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 0f9b42c1c6..890ceb386d 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -70,6 +70,8 @@ export class EnvironmentInformation { public command: string = ""; public nodeCount: number = 1; + public environmentLocalTempFolder: string = ""; + public gpuSummary: GPUSummary | undefined; constructor(id: string, jobName: string, jobId?: string) { diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 2eac82a12a..902d0f77c7 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ 
b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -26,11 +26,11 @@ import * as component from '../../common/component'; import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; -import { delay, getLogLevel, getVersion, uniqueString } from '../../common/utils'; +import { delay, getLogLevel, getVersion, uniqueString, getExperimentRootDir } from '../../common/utils'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { validateCodeDir } from '../common/util'; +import { validateCodeDir, execCopydir, execMkdir } from '../common/util'; import { EnvironmentInformation, EnvironmentService, RunnerSettings } from './environment'; import { JobRestServer } from './jobRestServer'; import { StorageService } from './storageService'; @@ -51,6 +51,7 @@ class TrialDispatcher implements TrainingService { private readonly metricsEmitter: EventEmitter; private versionCheck: boolean = true; private readonly experimentId: string; + private readonly experimentRootDir: string; private trialConfig: TrialConfig | undefined; private runnerSettings: RunnerSettings; @@ -65,6 +66,8 @@ class TrialDispatcher implements TrainingService { this.metricsEmitter = new EventEmitter(); this.jobRestServer = new JobRestServer(this.metricsEmitter); this.experimentId = getExperimentId(); + this.experimentRootDir = getExperimentRootDir(); + this.runnerSettings = new RunnerSettings(); this.runnerSettings.experimentId = this.experimentId; this.runnerSettings.platform = getPlatform(); @@ -394,6 +397,10 @@ class TrialDispatcher implements TrainingService { const name = `nni_exp_${this.experimentId}_env_${envId}`; const 
environment = new EnvironmentInformation(envId, name); + if (this.trialConfig === undefined) { + throw new Error(`trial config shouldn't be undefined in run()`); + } + environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; if (this.isDeveloping) { @@ -404,6 +411,19 @@ class TrialDispatcher implements TrainingService { const storageService = component.get(StorageService); environment.workingFolder = storageService.joinPath("envs", envId); await storageService.createDirectory(environment.workingFolder); + } else { + //write configuration to local folder, for AML + let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp", envId); + await execMkdir(environmentLocalTempFolder); + const runnerSettingsPath = path.join(environmentLocalTempFolder, "settings.json"); + await fs.promises.writeFile(runnerSettingsPath, JSON.stringify(this.runnerSettings), { encoding: 'utf8' }); + const installFilePath = path.join(environmentLocalTempFolder, "install_nni.sh"); + await fs.promises.writeFile(installFilePath, CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); + environment.command = `import os\nos.system('sh install_nni.sh && python3 -m nni_trial_tool.trial_runner')`; + environment.environmentLocalTempFolder = environmentLocalTempFolder; + let environmentLocalTempTrialFolder = path.join(environmentLocalTempFolder, 'code'); + await execMkdir(environmentLocalTempTrialFolder); + await execCopydir(this.trialConfig.codeDir, environmentLocalTempTrialFolder); } this.environments.set(environment.id, environment); From 45424e838c3d0529b0c4f3388c0284e5fd611b0d Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 22 Jun 2020 13:10:49 +0800 Subject: [PATCH 33/98] support concurrent trials in runner. 
--- .../training_service/reusable/environment.ts | 7 ++-- .../reusable/trialDispatcher.ts | 2 +- tools/nni_trial_tool/base_channel.py | 13 ++++++-- tools/nni_trial_tool/gpu.py | 1 - tools/nni_trial_tool/trial_runner.py | 33 +++++++++++++------ 5 files changed, 38 insertions(+), 18 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index e9ca1f6be8..735e08a78a 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -23,13 +23,14 @@ import { GPUSummary } from "training_service/common/gpuData"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; +export type Channel = "rest" | "file" | "aml" export abstract class EnvironmentService { public abstract get hasStorageService(): boolean; public abstract config(key: string, value: string): Promise; - public abstract refreshEnvironmentsStatus(environment: EnvironmentInformation[]): Promise; + public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; public abstract startEnvironment(environment: EnvironmentInformation): Promise; public abstract stopEnvironment(environment: EnvironmentInformation): Promise; } @@ -45,8 +46,8 @@ export class RunnerSettings { public enableGpuCollector: boolean = false; // specify which communication channel is used by runner. 
- // supported channel includes: api, storage, aml - public commandChannel: string = "file"; + // supported channel includes: rest, storage, aml + public commandChannel: Channel = "file"; } export class EnvironmentInformation { diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 96f461a08c..e56fe6d0de 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -433,7 +433,7 @@ class TrialDispatcher implements TrainingService { environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; if (this.isDeveloping) { - environment.command = "mkdir ./nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C ./nni_trial_tool &&" + environment.command; + environment.command = "[ -d \"nni_trial_tool\" ] && echo \"nni_trial_tool exists already\" || (mkdir ./nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C ./nni_trial_tool) &&" + environment.command; } if (environmentService.hasStorageService) { diff --git a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py index 829e12b89a..059449c02c 100644 --- a/tools/nni_trial_tool/base_channel.py +++ b/tools/nni_trial_tool/base_channel.py @@ -38,6 +38,7 @@ class BaseChannel(ABC): def __init__(self, args): self.is_keep_parsed = args.node_count > 1 self.args = args + self.node_id = self.args.node_id # initialize receive, send threads. self.is_running = True @@ -74,7 +75,10 @@ def _send_loop(self): # do nothing, if no command received. pass if message is not None: - nni_log(LogType.Info, 'Sending command: %s' % message) + if self.node_id is None: + nni_log(LogType.Info, 'Sending command: %s' % message) + else: + nni_log(LogType.Info, 'Sending command(%s): %s' % (self.node_id, message)) self._inner_send(message) def close(self): @@ -86,7 +90,7 @@ def send(self, command, data): data: string payload. 
the message is sent synchronized. """ - data["node"] = self.args.node_id + data["node"] = self.node_id data = json.dumps(data) data = data.encode('utf8') message = b'%b%014d%b' % (command.value, len(data), data) @@ -121,7 +125,10 @@ def receive(self): return None, None data = command_content[16:16+length] data = json.loads(data.decode('utf8')) - nni_log(LogType.Info, 'Received command, header: [%s], data: [%s]' % (header, data)) + if self.node_id is None: + nni_log(LogType.Info, 'Received command, header: [%s], data: [%s]' % (header, data)) + else: + nni_log(LogType.Info, 'Received command(%s), header: [%s], data: [%s]' % (self.node_id, header, data)) except Empty: # do nothing, if no command received. pass diff --git a/tools/nni_trial_tool/gpu.py b/tools/nni_trial_tool/gpu.py index 47cd70bce3..48dab4b182 100644 --- a/tools/nni_trial_tool/gpu.py +++ b/tools/nni_trial_tool/gpu.py @@ -52,7 +52,6 @@ def parse_nvidia_smi_result(smi): output["gpuInfos"].append(gpuInfo) except Exception: - # e_info = sys.exc_info() traceback.print_exc() output = {} return output diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 08de265be9..dbbacc767e 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -29,37 +29,42 @@ def main_loop(args): # init command channel command_channel = None - if args.command_channel == "api": + if args.command_channel == "rest": command_channel = FileChannel(args) else: command_channel = FileChannel(args) nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel))) args.command_channel = command_channel - trial = None + trials = dict() try: # command loop while True: command_type, command_data = command_channel.receive() if command_type == CommandType.NewTrialJob: - if trial is not None: + trial_id = command_data["trialId"] + if trial_id in trials.keys(): if trial.is_running(): raise Exception('trial %s is running 
already, cannot start a new one' % trial.id) else: - trial = None + del trials[trial_id] trial = Trial(args, command_data) trial.run() + trials[trial_id] = trial elif command_type == CommandType.KillTrialJob: - if trial is not None: + trial_id = command_data + if trial_id in trials.keys(): trial.kill(command_data) elif command_type is not None: raise Exception("unknown command %s" % command_type) - if trial is not None and trial.is_running(): - idle_last_time = datetime.now() - else: - trial = None + trial_list = list(trials.values()) + for trial in trial_list: + if trial is not None and trial.is_running(): + idle_last_time = datetime.now() + else: + del trials[trial.id] if (datetime.now() - idle_last_time).seconds > idle_timeout_seconds: nni_log(LogType.Info, "trial runner is idle more than {0} seconds, so exit.".format( @@ -76,8 +81,16 @@ def main_loop(args): traceback.print_exc() finally: nni_log(LogType.Info, "main_loop exits.") - if trial is not None: + + trial_list = list(trials.values()) + for trial in trial_list: trial.kill() + del trials[trial.id] + # wait to send commands + for i in range(10): + if command_channel.sent(): + break + time.sleep(1) command_channel.close() From 9ca344484a87ddb824b287cfafe5eb8c28490c15 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Wed, 24 Jun 2020 13:38:13 +0800 Subject: [PATCH 34/98] implement web channel other changes add initialized message of node. So running status is set by runner, not from env. 
--- deployment/pypi/setup.py | 3 +- setup.py | 3 +- src/nni_manager/common/restServer.ts | 4 +- src/nni_manager/core/commands.ts | 8 +- src/nni_manager/package.json | 4 +- .../reusable/channels/fileCommandChannel.ts | 64 ++++---- .../reusable/channels/webCommandChannel.ts | 144 ++++++++++++++++++ .../reusable/commandChannel.ts | 75 +++++++-- .../training_service/reusable/environment.ts | 33 ++++ .../environments/openPaiEnvironmentService.ts | 10 +- .../reusable/jobRestServer.ts | 6 + .../training_service/reusable/trial.ts | 12 +- .../reusable/trialDispatcher.ts | 60 ++++++-- src/nni_manager/yarn.lock | 121 ++------------- src/sdk/pynni/requirements.txt | 1 + tools/nni_trial_tool/base_channel.py | 102 +++++++++---- tools/nni_trial_tool/file_channel.py | 21 +-- tools/nni_trial_tool/trial_runner.py | 58 +++---- tools/nni_trial_tool/web_channel.py | 52 +++++++ tools/setup.py | 3 +- 20 files changed, 512 insertions(+), 272 deletions(-) create mode 100644 src/nni_manager/training_service/reusable/channels/webCommandChannel.ts create mode 100644 tools/nni_trial_tool/web_channel.py diff --git a/deployment/pypi/setup.py b/deployment/pypi/setup.py index 3c2d433790..6297970c99 100644 --- a/deployment/pypi/setup.py +++ b/deployment/pypi/setup.py @@ -62,7 +62,8 @@ 'scipy', 'coverage', 'colorama', - 'scikit-learn>=0.20,<0.22' + 'scikit-learn>=0.20,<0.22', + 'websockets' ], classifiers = [ 'Programming Language :: Python :: 3', diff --git a/setup.py b/setup.py index 8a3733776f..aee7105cda 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,8 @@ def read(fname): 'schema', 'PythonWebHDFS', 'colorama', - 'scikit-learn>=0.20,<0.22' + 'scikit-learn>=0.20,<0.22', + 'websockets' ], entry_points = { diff --git a/src/nni_manager/common/restServer.ts b/src/nni_manager/common/restServer.ts index 368aff977c..ed88e2d476 100644 --- a/src/nni_manager/common/restServer.ts +++ b/src/nni_manager/common/restServer.ts @@ -19,9 +19,9 @@ import { getBasePort } from './experimentStartupInfo'; export 
abstract class RestServer { private startTask!: Deferred; private stopTask!: Deferred; - private server!: http.Server; - + /** The fields can be inherited by subclass */ + protected server!: http.Server; protected hostName: string = '0.0.0.0'; protected port?: number; protected app: express.Application = express(); diff --git a/src/nni_manager/core/commands.ts b/src/nni_manager/core/commands.ts index 6e308808ec..575f492f14 100644 --- a/src/nni_manager/core/commands.ts +++ b/src/nni_manager/core/commands.ts @@ -21,11 +21,15 @@ const NO_MORE_TRIAL_JOBS = 'NO'; const KILL_TRIAL_JOB = 'KI'; const TRIAL_COMMANDS: Set = new Set([ + // from ctl to node NEW_TRIAL_JOB, - TRIAL_END, SEND_TRIAL_JOB_PARAMETER, - GPU_INFO, KILL_TRIAL_JOB, + + // from node to ctl + INITIALIZED, + TRIAL_END, + GPU_INFO, ]); const TUNER_COMMANDS: Set = new Set([ diff --git a/src/nni_manager/package.json b/src/nni_manager/package.json index 34aa0b0121..dfd51cd6f5 100644 --- a/src/nni_manager/package.json +++ b/src/nni_manager/package.json @@ -27,7 +27,8 @@ "ts-deferred": "^1.0.4", "typescript-ioc": "^1.2.4", "typescript-string-operations": "^1.3.1", - "webhdfs": "^1.2.0" + "webhdfs": "^1.2.0", + "ws": "^7.3.0" }, "devDependencies": { "@types/chai": "^4.1.4", @@ -43,6 +44,7 @@ "@types/ssh2": "^0.5.35", "@types/stream-buffers": "^3.0.2", "@types/tmp": "^0.0.33", + "@types/ws": "^7.2.5", "@typescript-eslint/eslint-plugin": "^2.10.0", "@typescript-eslint/parser": "^2.10.0", "chai": "^4.1.2", diff --git a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts index 29ac2171a7..a5e0aebd74 100644 --- a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts @@ -21,8 +21,8 @@ import * as component from "../../../common/component"; import { delay } from "../../../common/utils"; -import { CommandChannel } from 
"../commandChannel"; -import { EnvironmentInformation } from "../environment"; +import { CommandChannel, RunnerConnection } from "../commandChannel"; +import { EnvironmentInformation, Channel } from "../environment"; import { StorageService } from "../storageService"; class FileHandler { @@ -35,22 +35,23 @@ class FileHandler { } -class EnvironmentHandler { - public environment: EnvironmentInformation; +class FileRunnerConnection extends RunnerConnection { public handlers: Map = new Map(); - - constructor(environment: EnvironmentInformation) { - this.environment = environment; - } } export class FileCommandChannel extends CommandChannel { private readonly commandPath = "commands"; private stopping: boolean = false; - // each node have a receiver - private receive_handlers: Map = new Map(); // make sure no concurrent issue when sending commands. - private send_queues: [EnvironmentInformation, string][] = []; + private sendQueues: [EnvironmentInformation, string][] = []; + + public get channelName(): Channel { + return "file"; + } + + public async config(_key: string, _value: any): Promise { + // do nothing + } public start(): void { // start command loops @@ -62,21 +63,12 @@ export class FileCommandChannel extends CommandChannel { this.stopping = true; } - public async open(environment: EnvironmentInformation): Promise { - if (this.receive_handlers.has(environment.id)) { - throw new Error(`FileCommandChannel: env ${environment.id} is opened already, shouldn't be opened again.`); - } - this.receive_handlers.set(environment.id, new EnvironmentHandler(environment)); - } - - public async close(environment: EnvironmentInformation): Promise { - if (this.receive_handlers.has(environment.id)) { - this.receive_handlers.delete(environment.id); - } + protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise { + this.sendQueues.push([environment, message]); } - protected async sendCommandInternal(environment: EnvironmentInformation, 
message: string): Promise { - this.send_queues.push([environment, message]); + protected createRunnerConnection(environment: EnvironmentInformation): RunnerConnection { + return new FileRunnerConnection(environment); } private async sendLoop(): Promise { @@ -84,11 +76,11 @@ export class FileCommandChannel extends CommandChannel { while (!this.stopping) { const start = new Date(); - if (this.send_queues.length > 0) { + if (this.sendQueues.length > 0) { const storageService = component.get(StorageService); - while (this.send_queues.length > 0) { - const item = this.send_queues.shift(); + while (this.sendQueues.length > 0) { + const item = this.sendQueues.shift(); if (item === undefined) { break; } @@ -115,16 +107,16 @@ export class FileCommandChannel extends CommandChannel { while (!this.stopping) { const start = new Date(); - const envs = [...this.receive_handlers.values()]; - for (const environmentHandler of envs) { - const envCommandFolder = storageService.joinPath(environmentHandler.environment.workingFolder, this.commandPath); + const runnerConnections = [...this.runnerConnections.values()] as FileRunnerConnection[]; + for (const runnerConnection of runnerConnections) { + const envCommandFolder = storageService.joinPath(runnerConnection.environment.workingFolder, this.commandPath); // open new command files - if (environmentHandler.handlers.size < environmentHandler.environment.nodeCount) { + if (runnerConnection.handlers.size < runnerConnection.environment.nodeCount) { // to find all node commands file const commandFileNames = await storageService.listDirectory(envCommandFolder); const toAddedFileNames = []; for (const commandFileName of commandFileNames) { - if (commandFileName.startsWith("runner_commands") && !environmentHandler.handlers.has(commandFileName)) { + if (commandFileName.startsWith("runner_commands") && !runnerConnection.handlers.has(commandFileName)) { toAddedFileNames.push(commandFileName); } } @@ -132,18 +124,18 @@ export class 
FileCommandChannel extends CommandChannel { for (const toAddedFileName of toAddedFileNames) { const fullPath = storageService.joinPath(envCommandFolder, toAddedFileName); const fileHandler: FileHandler = new FileHandler(fullPath); - environmentHandler.handlers.set(toAddedFileName, fileHandler); - this.log.debug(`FileCommandChannel: added fileHandler env ${environmentHandler.environment.id} ${toAddedFileName}`); + runnerConnection.handlers.set(toAddedFileName, fileHandler); + this.log.debug(`FileCommandChannel: added fileHandler env ${runnerConnection.environment.id} ${toAddedFileName}`); } } // to loop all commands - for (const fileHandler of environmentHandler.handlers.values()) { + for (const fileHandler of runnerConnection.handlers.values()) { const newContent = await storageService.readFileContent(fileHandler.fileName, fileHandler.offset, undefined); if (newContent.length > 0) { const commands = newContent.split('\n'); for (const command of commands) { - this.handleCommand(environmentHandler.environment, command); + this.handleCommand(runnerConnection.environment, command); } fileHandler.offset += newContent.length; } diff --git a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts new file mode 100644 index 0000000000..cdb612e39b --- /dev/null +++ b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts @@ -0,0 +1,144 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. 
+ * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import { Server as HttpServer } from 'http'; +import { Server as SocketServer } from "ws"; +import { getExperimentId } from "../../../common/experimentStartupInfo"; +import { INITIALIZED } from '../../../core/commands'; +import { CommandChannel, RunnerConnection } from "../commandChannel"; +import { Channel, EnvironmentInformation } from "../environment"; + +class WebRunnerConnection extends RunnerConnection { + public readonly clients: WebSocket[] = []; + + public async close(): Promise { + await super.close(); + while (this.clients.length > 0) { + const client = this.clients.shift(); + if (client !== undefined) { + client.close(); + } + } + } + + public AddClient(client: WebSocket): void { + this.clients.push(client); + } +} + +export class WebCommandChannel extends CommandChannel { + private readonly expId: string = getExperimentId(); + + private httpServer: HttpServer | undefined; + private webSocketServer: SocketServer | undefined; + private clients: Map = new Map(); + + public get channelName(): Channel { + return "rest"; + } + + public async config(key: string, value: any): Promise { + switch (key) { + case "RestServer": + this.httpServer = value as HttpServer; + break; + } + } + + public start(): void { + if (this.httpServer === undefined) { + throw new Error(`http server is not initialized!`); + } + + const server = this.httpServer; + this.webSocketServer = new SocketServer({ server }); + + this.webSocketServer.on('connection', (client: WebSocket) => { + this.log.debug(`WebCommandChannel: received connection`); + + this.clients.set(client, undefined); + client.onmessage = (message): void => { + this.receivedWebSocketMessage(client, message); + }; + }); + } + + public stop(): void { + if (this.webSocketServer !== undefined) { + this.webSocketServer.close(); + } + } + + protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise { + if (this.webSocketServer === undefined) { + throw new 
Error(`WebCommandChannel: uninitialized!`) + } + const runnerConnection = this.runnerConnections.get(environment.id) as WebRunnerConnection; + if (runnerConnection !== undefined) { + for (const client of runnerConnection.clients) { + client.send(message); + } + } else { + this.log.warning(`WebCommandChannel: cannot find client for env ${environment.id}, message is ignored.`); + } + } + + protected createRunnerConnection(environment: EnvironmentInformation): RunnerConnection { + return new WebRunnerConnection(environment); + } + + private receivedWebSocketMessage(client: WebSocket, message: MessageEvent): void { + let connection = this.clients.get(client) as WebRunnerConnection | undefined; + const rawCommands = message.data.toString(); + + if (connection === undefined) { + // undefined means it's expecting initializing message. + const commands = this.parseCommands(rawCommands); + let isValid = false; + this.log.debug(`WebCommandChannel: received initialize message: ${JSON.stringify(rawCommands)}`); + + if (commands.length > 0) { + const commandType = commands[0][0]; + const result = commands[0][1]; + if (commandType === INITIALIZED && + result.expId === this.expId && + this.runnerConnections.has(result.runnerId) + ) { + const runnerConnection = this.runnerConnections.get(result.runnerId) as WebRunnerConnection; + this.clients.set(client, runnerConnection); + runnerConnection.AddClient(client); + connection = runnerConnection; + isValid = true; + } + } + + if (!isValid) { + this.log.warning(`WebCommandChannel: rejected client with invalid init message ${rawCommands}`); + client.close(); + this.clients.delete(client); + } + } + + if (connection !== undefined) { + this.handleCommand(connection.environment, rawCommands); + } + } +} diff --git a/src/nni_manager/training_service/reusable/commandChannel.ts b/src/nni_manager/training_service/reusable/commandChannel.ts index 29da8781b8..96756e67a1 100644 --- a/src/nni_manager/training_service/reusable/commandChannel.ts +++ 
b/src/nni_manager/training_service/reusable/commandChannel.ts @@ -22,16 +22,11 @@ import { EventEmitter } from "events"; import { TRIAL_COMMANDS } from "../../core/commands"; import { encodeCommand } from "../../core/ipcInterface"; -import { EnvironmentInformation } from "./environment"; +import { EnvironmentInformation, Channel } from "./environment"; import { Logger, getLogger } from "../../common/log"; const acceptedCommands: Set = new Set(TRIAL_COMMANDS); -export enum ChannelType { - API = "api", - Storage = "storage", -} - export class Command { public readonly environment: EnvironmentInformation; public readonly command: string; @@ -47,10 +42,27 @@ export class Command { } } +export abstract class RunnerConnection { + public readonly environment: EnvironmentInformation; + + constructor(environment: EnvironmentInformation) { + this.environment = environment; + } + + public async open(): Promise { + // do nothing + } + + public async close(): Promise { + // do nothing + } +} + export abstract class CommandChannel { protected readonly log: Logger; - + protected runnerConnections: Map = new Map(); protected readonly commandEmitter: EventEmitter; + private readonly commandPattern: RegExp = /(?[\w]{2})(?[\d]{14})(?.*)\n?/gm; public constructor(commandEmitter: EventEmitter) { @@ -58,21 +70,42 @@ export abstract class CommandChannel { this.commandEmitter = commandEmitter; } + public abstract get channelName(): Channel; + public abstract config(key: string, value: any): Promise; public abstract start(): void; public abstract stop(): void; - public abstract open(environment: EnvironmentInformation): Promise; - public abstract close(environment: EnvironmentInformation): Promise; - protected abstract sendCommandInternal(environment: EnvironmentInformation, message: string): Promise; + protected abstract createRunnerConnection(environment: EnvironmentInformation): RunnerConnection; public async sendCommand(environment: EnvironmentInformation, commantType: string, data: 
any): Promise { const command = encodeCommand(commantType, JSON.stringify(data)); + this.log.debug(`CommandChannel: env ${environment.id} sending command: ${command}`); await this.sendCommandInternal(environment, command.toString("utf8")); - this.log.debug(`CommandChannel: env ${environment.id} sent command: ${command}`); } - protected handleCommand(environment: EnvironmentInformation, content: string): void { + public async open(environment: EnvironmentInformation): Promise { + if (this.runnerConnections.has(environment.id)) { + throw new Error(`CommandChannel: env ${environment.id} is opened already, shouldn't be opened again.`); + } + const connection = this.createRunnerConnection(environment); + this.runnerConnections.set(environment.id, connection); + await connection.open(); + } + + public async close(environment: EnvironmentInformation): Promise { + if (this.runnerConnections.has(environment.id)) { + const connection = this.runnerConnections.get(environment.id); + this.runnerConnections.delete(environment.id); + if (connection !== undefined) { + await connection.close(); + } + } + } + + protected parseCommands(content: string): [string, any][] { + const commands: [string, any][] = []; + let matches = this.commandPattern.exec(content); while (matches) { @@ -86,11 +119,23 @@ export abstract class CommandChannel { // to handle encode('utf8') of Python data = JSON.parse('"' + data.split('"').join('\\"') + '"'); const finalData = JSON.parse(data); - const command = new Command(environment, commandType, finalData); - this.commandEmitter.emit("command", command); - this.log.debug(`CommandChannel: env ${environment.id} emit command: ${commandType}, ${data}`); + commands.push([commandType, finalData]); } matches = this.commandPattern.exec(content); } + + return commands; + } + + protected handleCommand(environment: EnvironmentInformation, content: string): void { + const parsedResults = this.parseCommands(content); + + for (const parsedResult of parsedResults) { + 
const commandType = parsedResult[0]; + const data = parsedResult[1]; + const command = new Command(environment, commandType, data); + this.commandEmitter.emit("command", command); + this.log.trace(`CommandChannel: env ${environment.id} emit command: ${commandType}, ${data}`); + } } } diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 735e08a78a..daf03f946f 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -20,6 +20,8 @@ 'use strict'; import { GPUSummary } from "training_service/common/gpuData"; +import { getLogger, Logger } from "../../common/log"; +import { TrialJobStatus } from "../../common/trainingService"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; @@ -35,6 +37,16 @@ export abstract class EnvironmentService { public abstract stopEnvironment(environment: EnvironmentInformation): Promise; } +export class NodeInfomation { + public id: string; + public status: TrialJobStatus = "UNKNOWN"; + public endTime?: number; + + constructor(id: string) { + this.id = id; + } +} + export class RunnerSettings { public experimentId: string = ""; public platform: string = ""; @@ -51,6 +63,8 @@ export class RunnerSettings { } export class EnvironmentInformation { + private log: Logger; + // NNI environment ID public id: string; // training platform unique job ID. @@ -63,6 +77,7 @@ export class EnvironmentInformation { public isIdle: boolean = false; // true: environment is running, waiting, or unknown. public isAlive: boolean = true; + // don't set status in environment directly, use setFinalState function to set a final state. 
public status: EnvironmentStatus = "UNKNOWN"; public trackingUrl: string = ""; @@ -71,11 +86,29 @@ export class EnvironmentInformation { public command: string = ""; public nodeCount: number = 1; + // it's used to aggregate node status for multiple node trial + public nodes: Map; public gpuSummary: Map = new Map(); constructor(id: string, jobName: string, jobId?: string) { + this.log = getLogger(); this.id = id; this.jobName = jobName; this.jobId = jobId ? jobId : jobName; + this.nodes = new Map(); + } + + public setFinalStatus(status: EnvironmentStatus): void { + switch (status) { + case 'WAITING': + case 'SUCCEEDED': + case 'FAILED': + case 'USER_CANCELED': + this.status = status; + break; + default: + this.log.error(`Environment: job ${this.jobId} set an invalid final state ${status}.`); + break; + } } } diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 6895cc3632..9c8662242f 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -134,19 +134,21 @@ export class OpenPaiEnvironmentService implements EnvironmentService { if (jobResponse && jobResponse.state) { const oldEnvironmentStatus = environment.status; switch (jobResponse.state) { - case 'WAITING': case 'RUNNING': + // RUNNING state is set by runner. 
+ break; + case 'WAITING': case 'SUCCEEDED': case 'FAILED': - environment.status = jobResponse.state; + environment.setFinalStatus(jobResponse.state); break; case 'STOPPED': case 'STOPPING': - environment.status = 'USER_CANCELED'; + environment.setFinalStatus('USER_CANCELED'); break; default: this.log.error(`OpenPAI: job ${environment.jobId} returns unknown state ${jobResponse.state}.`); - environment.status = 'UNKNOWN'; + environment.setFinalStatus('UNKNOWN'); } if (oldEnvironmentStatus !== environment.status) { this.log.debug(`OpenPAI: job ${environment.jobId} change status ${oldEnvironmentStatus} to ${environment.status} due to job is ${jobResponse.state}.`) diff --git a/src/nni_manager/training_service/reusable/jobRestServer.ts b/src/nni_manager/training_service/reusable/jobRestServer.ts index 51e897b8ab..a0871d9ade 100644 --- a/src/nni_manager/training_service/reusable/jobRestServer.ts +++ b/src/nni_manager/training_service/reusable/jobRestServer.ts @@ -22,6 +22,7 @@ import { EventEmitter } from 'events'; import { Request, Response, Router } from 'express'; import { ClusterJobRestServer } from '../common/clusterJobRestServer'; +import { Server } from 'http'; export interface ParameterFileMeta { readonly experimentId: string; @@ -46,6 +47,11 @@ export class JobRestServer extends ClusterJobRestServer { this.setEnableVersionCheck = true; } + public get Server(): Server { + return this.server; + } + + protected handleTrialMetrics(jobId: string, metrics: any[]): void { // Split metrics array into single metric, then emit // Warning: If not split metrics into single ones, the behavior will be UNKNOWN diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts index d9bccee59e..396d912ff9 100644 --- a/src/nni_manager/training_service/reusable/trial.ts +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -21,7 +21,7 @@ import { Logger, getLogger } from "../../common/log"; import { TrialJobApplicationForm, 
TrialJobDetail, TrialJobStatus } from "../../common/trainingService"; -import { EnvironmentInformation } from "./environment"; +import { EnvironmentInformation, NodeInfomation } from "./environment"; import { GPUInfo } from "training_service/common/gpuData"; export abstract class TrialService { @@ -70,13 +70,3 @@ export class TrialDetail implements TrialJobDetail { this.nodes = new Map(); } } - -export class NodeInfomation { - public id: string; - public status: TrialJobStatus = "UNKNOWN"; - public endTime?: number; - - constructor(id: string) { - this.id = id; - } -} diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index e56fe6d0de..0f38c3f85e 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -27,18 +27,18 @@ import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; import { delay, getLogLevel, getVersion, uniqueString } from '../../common/utils'; -import { GPU_INFO, TRIAL_END } from '../../core/commands'; +import { GPU_INFO, INITIALIZED, TRIAL_END } from '../../core/commands'; import { GPUSummary } from '../../training_service/common/gpuData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { validateCodeDir } from '../common/util'; -import { FileCommandChannel } from './channels/fileCommandChannel'; +import { WebCommandChannel } from './channels/webCommandChannel'; import { Command, CommandChannel } from './commandChannel'; -import { EnvironmentInformation, EnvironmentService, RunnerSettings } from 
'./environment'; +import { EnvironmentInformation, EnvironmentService, RunnerSettings, NodeInfomation } from './environment'; import { JobRestServer } from './jobRestServer'; import { StorageService } from './storageService'; -import { NodeInfomation, TrialDetail, TrialService } from './trial'; +import { TrialDetail, TrialService } from './trial'; /** * It uses to manage jobs on training platforms @@ -77,7 +77,7 @@ class TrialDispatcher implements TrainingService { this.runnerSettings.platform = getPlatform(); this.commandEmitter = new EventEmitter(); - this.commandChannel = new FileCommandChannel(this.commandEmitter); + this.commandChannel = new WebCommandChannel(this.commandEmitter); const logLevel = getLogLevel(); this.log.debug(`current folder ${__dirname}`); @@ -165,7 +165,10 @@ class TrialDispatcher implements TrainingService { this.jobRestServer.setEnableVersionCheck = this.versionCheck; this.log.info(`TrialDispatcher: rest server listening on: ${this.jobRestServer.endPoint}`); this.runnerSettings.nniManagerPort = this.jobRestServer.clusterRestServerPort; + this.runnerSettings.commandChannel = this.commandChannel.channelName; + // for restful api, other channel can ignore this. 
+ this.commandChannel.config("RestServer", this.jobRestServer.Server); // start channel this.commandEmitter.on("command", (command: Command): void => { this.handleCommand(command).catch((err: Error) => { @@ -173,7 +176,7 @@ class TrialDispatcher implements TrainingService { }) }); this.commandChannel.start(); - this.log.info(`TrialDispatcher: started channel ${typeof (this.commandChannel)}`); + this.log.info(`TrialDispatcher: started channel: ${this.commandChannel.constructor.name}`); if (this.trialConfig === undefined) { throw new Error(`trial config shouldn't be undefined in run()`); @@ -433,7 +436,7 @@ class TrialDispatcher implements TrainingService { environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; if (this.isDeveloping) { - environment.command = "[ -d \"nni_trial_tool\" ] && echo \"nni_trial_tool exists already\" || (mkdir ./nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C ./nni_trial_tool) &&" + environment.command; + environment.command = "[ -d \"nni_trial_tool\" ] && echo \"nni_trial_tool exists already\" || (mkdir ./nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C ./nni_trial_tool) && pip3 install websockets && " + environment.command; } if (environmentService.hasStorageService) { @@ -499,7 +502,40 @@ class TrialDispatcher implements TrainingService { switch (command.command) { case GPU_INFO: environment.gpuSummary.set(nodeId, (data)); - break + break; + case INITIALIZED: + const oldStatus = environment.status; + let isAllReady = true; + + if (environment.nodeCount > 1) { + let node = environment.nodes.get(nodeId); + if (node === undefined) { + node = new NodeInfomation(nodeId); + environment.nodes.set(nodeId, node); + } + const oldNodeStatus = node.status; + if (oldNodeStatus === "UNKNOWN") { + node.status = "RUNNING"; + } + + if (environment.nodes.size === environment.nodeCount) { + for (const node of environment.nodes.values()) { + if (node.status !== "RUNNING") { + isAllReady = false; + break; + } + } 
+ } else { + isAllReady = false; + } + } + + // single node is always ready to set env status + if (isAllReady && oldStatus === "UNKNOWN") { + environment.status = "RUNNING"; + this.log.info(`TrialDispatcher: env ${environment.id} received initialized message, old status: ${oldStatus}, new status: ${environment.status}.`); + } + break; case TRIAL_END: { const trialId = data["trial"]; @@ -511,10 +547,8 @@ class TrialDispatcher implements TrainingService { exitStatus = "FAILED"; } - let node: NodeInfomation | undefined; - if (trial.nodes.has(nodeId)) { - node = trial.nodes.get(nodeId); - } else { + let node = environment.nodes.get(nodeId); + if (node === undefined) { node = new NodeInfomation(nodeId); trial.nodes.set(nodeId, node); } @@ -524,7 +558,7 @@ class TrialDispatcher implements TrainingService { node.status = exitStatus; node.endTime = timestamp; } - break + break; } } } diff --git a/src/nni_manager/yarn.lock b/src/nni_manager/yarn.lock index ab14b07403..a6b92fae96 100644 --- a/src/nni_manager/yarn.lock +++ b/src/nni_manager/yarn.lock @@ -11,14 +11,12 @@ "@babel/code-frame@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/code-frame/-/code-frame-7.8.3.tgz#33e25903d7481181534e12ec0a25f16b6fcf419e" - integrity sha512-a9gxpmdXtZEInkCSHUJDLHZVBgb1QS0jhss4cPP93EW7s+uC5bikET2twEF3KV+7rDblJcmNvTR7VJejqd2C2g== dependencies: "@babel/highlight" "^7.8.3" "@babel/core@^7.7.5": version "7.9.0" resolved "https://registry.yarnpkg.com/@babel/core/-/core-7.9.0.tgz#ac977b538b77e132ff706f3b8a4dbad09c03c56e" - integrity sha512-kWc7L0fw1xwvI0zi8OKVBuxRVefwGOrKSQMvrQ3dW+bIIavBY3/NpXmpjMy7bQnLgwgzWQZ8TlM57YHpHNHz4w== dependencies: "@babel/code-frame" "^7.8.3" "@babel/generator" "^7.9.0" @@ -40,7 +38,6 @@ "@babel/generator@^7.9.0": version "7.9.4" resolved "https://registry.yarnpkg.com/@babel/generator/-/generator-7.9.4.tgz#12441e90c3b3c4159cdecf312075bf1a8ce2dbce" - integrity 
sha512-rjP8ahaDy/ouhrvCoU1E5mqaitWrxwuNGU+dy1EpaoK48jZay4MdkskKGIMHLZNewg8sAsqpGSREJwP0zH3YQA== dependencies: "@babel/types" "^7.9.0" jsesc "^2.5.1" @@ -50,7 +47,6 @@ "@babel/helper-function-name@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/helper-function-name/-/helper-function-name-7.8.3.tgz#eeeb665a01b1f11068e9fb86ad56a1cb1a824cca" - integrity sha512-BCxgX1BC2hD/oBlIFUgOCQDOPV8nSINxCwM3o93xP4P9Fq6aV5sgv2cOOITDMtCfQ+3PvHp3l689XZvAM9QyOA== dependencies: "@babel/helper-get-function-arity" "^7.8.3" "@babel/template" "^7.8.3" @@ -59,28 +55,24 @@ "@babel/helper-get-function-arity@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/helper-get-function-arity/-/helper-get-function-arity-7.8.3.tgz#b894b947bd004381ce63ea1db9f08547e920abd5" - integrity sha512-FVDR+Gd9iLjUMY1fzE2SR0IuaJToR4RkCDARVfsBBPSP53GEqSFjD8gNyxg246VUyc/ALRxFaAK8rVG7UT7xRA== dependencies: "@babel/types" "^7.8.3" "@babel/helper-member-expression-to-functions@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/helper-member-expression-to-functions/-/helper-member-expression-to-functions-7.8.3.tgz#659b710498ea6c1d9907e0c73f206eee7dadc24c" - integrity sha512-fO4Egq88utkQFjbPrSHGmGLFqmrshs11d46WI+WZDESt7Wu7wN2G2Iu+NMMZJFDOVRHAMIkB5SNh30NtwCA7RA== dependencies: "@babel/types" "^7.8.3" "@babel/helper-module-imports@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/helper-module-imports/-/helper-module-imports-7.8.3.tgz#7fe39589b39c016331b6b8c3f441e8f0b1419498" - integrity sha512-R0Bx3jippsbAEtzkpZ/6FIiuzOURPcMjHp+Z6xPe6DtApDJx+w7UYyOLanZqO8+wKR9G10s/FmHXvxaMd9s6Kg== dependencies: "@babel/types" "^7.8.3" "@babel/helper-module-transforms@^7.9.0": version "7.9.0" resolved "https://registry.yarnpkg.com/@babel/helper-module-transforms/-/helper-module-transforms-7.9.0.tgz#43b34dfe15961918707d247327431388e9fe96e5" - integrity sha512-0FvKyu0gpPfIQ8EkxlrAydOWROdHpBmiCiRwLkUiBGhCUPRRbVD2/tm3sFr/c/GWFrQ/ffutGUAnx7V0FzT2wA== 
dependencies: "@babel/helper-module-imports" "^7.8.3" "@babel/helper-replace-supers" "^7.8.6" @@ -93,14 +85,12 @@ "@babel/helper-optimise-call-expression@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/helper-optimise-call-expression/-/helper-optimise-call-expression-7.8.3.tgz#7ed071813d09c75298ef4f208956006b6111ecb9" - integrity sha512-Kag20n86cbO2AvHca6EJsvqAd82gc6VMGule4HwebwMlwkpXuVqrNRj6CkCV2sKxgi9MyAUnZVnZ6lJ1/vKhHQ== dependencies: "@babel/types" "^7.8.3" "@babel/helper-replace-supers@^7.8.6": version "7.8.6" resolved "https://registry.yarnpkg.com/@babel/helper-replace-supers/-/helper-replace-supers-7.8.6.tgz#5ada744fd5ad73203bf1d67459a27dcba67effc8" - integrity sha512-PeMArdA4Sv/Wf4zXwBKPqVj7n9UF/xg6slNRtZW84FM7JpE1CbG8B612FyM4cxrf4fMAMGO0kR7voy1ForHHFA== dependencies: "@babel/helper-member-expression-to-functions" "^7.8.3" "@babel/helper-optimise-call-expression" "^7.8.3" @@ -110,7 +100,6 @@ "@babel/helper-simple-access@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/helper-simple-access/-/helper-simple-access-7.8.3.tgz#7f8109928b4dab4654076986af575231deb639ae" - integrity sha512-VNGUDjx5cCWg4vvCTR8qQ7YJYZ+HBjxOgXEl7ounz+4Sn7+LMD3CFrCTEU6/qXKbA2nKg21CwhhBzO0RpRbdCw== dependencies: "@babel/template" "^7.8.3" "@babel/types" "^7.8.3" @@ -118,19 +107,16 @@ "@babel/helper-split-export-declaration@^7.8.3": version "7.8.3" resolved "https://registry.yarnpkg.com/@babel/helper-split-export-declaration/-/helper-split-export-declaration-7.8.3.tgz#31a9f30070f91368a7182cf05f831781065fc7a9" - integrity sha512-3x3yOeyBhW851hroze7ElzdkeRXQYQbFIb7gLK1WQYsw2GWDay5gAJNw1sWJ0VFP6z5J1whqeXH/WCdCjZv6dA== dependencies: "@babel/types" "^7.8.3" "@babel/helper-validator-identifier@^7.9.0": version "7.9.0" resolved "https://registry.yarnpkg.com/@babel/helper-validator-identifier/-/helper-validator-identifier-7.9.0.tgz#ad53562a7fc29b3b9a91bbf7d10397fd146346ed" - integrity 
sha512-6G8bQKjOh+of4PV/ThDm/rRqlU7+IGoJuofpagU5GlEl29Vv0RGqqt86ZGRV8ZuSOY3o+8yXl5y782SMcG7SHw== "@babel/helpers@^7.9.0": version "7.9.2" resolved "https://registry.yarnpkg.com/@babel/helpers/-/helpers-7.9.2.tgz#b42a81a811f1e7313b88cba8adc66b3d9ae6c09f" - integrity sha512-JwLvzlXVPjO8eU9c/wF9/zOIN7X6h8DYf7mG4CiFRZRvZNKEF5dQ3H3V+ASkHoIB3mWhatgl5ONhyqHRI6MppA== dependencies: "@babel/template" "^7.8.3" "@babel/traverse" "^7.9.0" @@ -147,7 +133,6 @@ "@babel/highlight@^7.8.3": version "7.9.0" resolved "https://registry.yarnpkg.com/@babel/highlight/-/highlight-7.9.0.tgz#4e9b45ccb82b79607271b2979ad82c7b68163079" - integrity sha512-lJZPilxX7Op3Nv/2cvFdnlepPXDxi29wxteT57Q965oc5R9v86ztx0jfxVrTcBk8C2kcPkkDa2Z4T3ZsPPVWsQ== dependencies: "@babel/helper-validator-identifier" "^7.9.0" chalk "^2.0.0" @@ -156,12 +141,10 @@ "@babel/parser@^7.7.5", "@babel/parser@^7.8.6", "@babel/parser@^7.9.0": version "7.9.4" resolved "https://registry.yarnpkg.com/@babel/parser/-/parser-7.9.4.tgz#68a35e6b0319bbc014465be43828300113f2f2e8" - integrity sha512-bC49otXX6N0/VYhgOMh4gnP26E9xnDZK3TmbNpxYzzz9BQLBosQwfyOe9/cXUU3txYhTzLCbcqd5c8y/OmCjHA== "@babel/template@^7.7.4", "@babel/template@^7.8.3", "@babel/template@^7.8.6": version "7.8.6" resolved "https://registry.yarnpkg.com/@babel/template/-/template-7.8.6.tgz#86b22af15f828dfb086474f964dcc3e39c43ce2b" - integrity sha512-zbMsPMy/v0PWFZEhQJ66bqjhH+z0JgMoBWuikXybgG3Gkd/3t5oQ1Rw2WQhnSrsOmsKXnZOx15tkC4qON/+JPg== dependencies: "@babel/code-frame" "^7.8.3" "@babel/parser" "^7.8.6" @@ -170,7 +153,6 @@ "@babel/traverse@^7.7.4", "@babel/traverse@^7.8.6", "@babel/traverse@^7.9.0": version "7.9.0" resolved "https://registry.yarnpkg.com/@babel/traverse/-/traverse-7.9.0.tgz#d3882c2830e513f4fe4cec9fe76ea1cc78747892" - integrity sha512-jAZQj0+kn4WTHO5dUZkZKhbFrqZE7K5LAQ5JysMnmvGij+wOdr+8lWqPeW0BcF4wFwrEXXtdGO7wcV6YPJcf3w== dependencies: "@babel/code-frame" "^7.8.3" "@babel/generator" "^7.9.0" @@ -185,7 +167,6 @@ "@babel/types@^7.8.3", "@babel/types@^7.8.6", 
"@babel/types@^7.9.0": version "7.9.0" resolved "https://registry.yarnpkg.com/@babel/types/-/types-7.9.0.tgz#00b064c3df83ad32b2dbf5ff07312b15c7f1efb5" - integrity sha512-BS9JKfXkzzJl8RluW4JGknzpiUV7ZrvTayM6yfqLTVBEnFtyowVIOu6rqxRd5cVO6yGoWf4T8u8dgK9oB+GCng== dependencies: "@babel/helper-validator-identifier" "^7.9.0" lodash "^4.17.13" @@ -194,7 +175,6 @@ "@istanbuljs/load-nyc-config@^1.0.0": version "1.0.0" resolved "https://registry.yarnpkg.com/@istanbuljs/load-nyc-config/-/load-nyc-config-1.0.0.tgz#10602de5570baea82f8afbfa2630b24e7a8cfe5b" - integrity sha512-ZR0rq/f/E4f4XcgnDvtMWXCUJpi8eO0rssVhmztsZqLIEFA9UUP9zmpE0VxlM+kv/E1ul2I876Fwil2ayptDVg== dependencies: camelcase "^5.3.1" find-up "^4.1.0" @@ -204,7 +184,6 @@ "@istanbuljs/schema@^0.1.2": version "0.1.2" resolved "https://registry.yarnpkg.com/@istanbuljs/schema/-/schema-0.1.2.tgz#26520bf09abe4a5644cd5414e37125a8954241dd" - integrity sha512-tsAQNx32a8CoFhjhijUIhI4kccIAgmGhy8LZMZgGfmXcpMbPRUqn5LWmgRttILi6yeGmBJd2xsPkFMs0PzgPCw== "@sindresorhus/is@^0.7.0": version "0.7.0" @@ -234,7 +213,6 @@ "@types/color-name@^1.1.1": version "1.1.1" resolved "https://registry.yarnpkg.com/@types/color-name/-/color-name-1.1.1.tgz#1c1261bbeaa10a8055bbc5d8ab84b7b2afc846a0" - integrity sha512-rr+OQyAjxze7GgWrSaJwydHStIhHq2lvY3BOC2Mj7KnzI7XK0Uw1TOOdI9lDoajEbSWLiYgoo4f1R51erQfhPQ== "@types/connect@*": version "3.4.32" @@ -450,6 +428,12 @@ version "2.3.3" resolved "https://registry.yarnpkg.com/@types/tough-cookie/-/tough-cookie-2.3.3.tgz#7f226d67d654ec9070e755f46daebf014628e9d9" +"@types/ws@^7.2.5": + version "7.2.5" + resolved "https://registry.yarnpkg.com/@types/ws/-/ws-7.2.5.tgz#513f28b04a1ea1aa9dc2cad3f26e8e37c88aae49" + dependencies: + "@types/node" "*" + "@typescript-eslint/eslint-plugin@^2.10.0": version "2.10.0" resolved "https://registry.yarnpkg.com/@typescript-eslint/eslint-plugin/-/eslint-plugin-2.10.0.tgz#c4cb103275e555e8a7e9b3d14c5951eb6d431e70" @@ -514,7 +498,6 @@ acorn-jsx@^5.1.0: acorn@>=7.1.1, acorn@^7.1.0: version 
"7.1.1" resolved "https://registry.yarnpkg.com/acorn/-/acorn-7.1.1.tgz#e35668de0b402f359de515c5482a1ab9f89a69bf" - integrity sha512-add7dgA5ppRPxCFJoAGfMDi7PIBXq1RtGo7BhbLaxwrXPOmw8gq48Y9ozT01hUKy9byMjlR20EJhu5zlkErEkg== agent-base@4, agent-base@^4.3.0: version "4.3.0" @@ -544,7 +527,6 @@ aggregate-error@^1.0.0: aggregate-error@^3.0.0: version "3.0.1" resolved "https://registry.yarnpkg.com/aggregate-error/-/aggregate-error-3.0.1.tgz#db2fe7246e536f40d9b5442a39e117d7dd6a24e0" - integrity sha512-quoaXsZ9/BLNae5yiNoUz+Nhkwz83GhWwtYFglcjEQB2NDHCIpApbqXxIFnm4Pq/Nvhrsq5sYJFyohrrxnTGAA== dependencies: clean-stack "^2.0.0" indent-string "^4.0.0" @@ -576,7 +558,6 @@ ansi-align@^2.0.0: ansi-colors@3.2.3: version "3.2.3" resolved "https://registry.yarnpkg.com/ansi-colors/-/ansi-colors-3.2.3.tgz#57d35b8686e851e2cc04c403f1c00203976a1813" - integrity sha512-LEHHyuhlPY3TmuUYMh2oz89lTShfvgbmzaBcxve9t/9Wuy7Dwf4yoAKcND7KFT1HAQfqZ12qtc+DUrBMeKF9nw== ansi-escapes@^4.2.1: version "4.3.0" @@ -609,7 +590,6 @@ ansi-styles@^3.2.0, ansi-styles@^3.2.1: ansi-styles@^4.0.0: version "4.2.1" resolved "https://registry.yarnpkg.com/ansi-styles/-/ansi-styles-4.2.1.tgz#90ae75c424d008d2624c5bf29ead3177ebfcf359" - integrity sha512-9VGjrMsG1vePxcSweQsN20KY/c4zN0h9fLjqAbwbPfahM3t+NL+M9HC8xeXG2I8pX5NoamTGNuomEUFI7fcUjA== dependencies: "@types/color-name" "^1.1.1" color-convert "^2.0.1" @@ -625,7 +605,6 @@ ansistyles@~0.1.3: anymatch@~3.1.1: version "3.1.1" resolved "https://registry.yarnpkg.com/anymatch/-/anymatch-3.1.1.tgz#c55ecf02185e2469259399310c173ce31233b142" - integrity sha512-mM8522psRCqzV+6LhomX5wgp25YVibjh8Wj23I5RPkPppSVSjyKD2A2mBJmWGa+KN7f2D6LNh9jkBCeyLktzjg== dependencies: normalize-path "^3.0.0" picomatch "^2.0.4" @@ -633,7 +612,6 @@ anymatch@~3.1.1: append-transform@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/append-transform/-/append-transform-2.0.0.tgz#99d9d29c7b38391e6f428d28ce136551f0b77e12" - integrity 
sha512-7yeyCEurROLQJFv5Xj4lEGTy0borxepjFv1g22oAdqFu//SrAlDl1O1Nxx15SH1RoliUml6p8dwJW9jvZughhg== dependencies: default-require-extensions "^3.0.0" @@ -764,7 +742,6 @@ bin-links@^1.1.2, bin-links@^1.1.6: binary-extensions@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/binary-extensions/-/binary-extensions-2.0.0.tgz#23c0df14f6a88077f5f986c0d167ec03c3d5537c" - integrity sha512-Phlt0plgpIIBOGTT/ehfFnbNlfsDEiqmzE2KRXoX1bLIlir4X/MR+zSyBEkL05ffWgnRSf/DXv+WrUAVr93/ow== bluebird@^3.5.1, bluebird@^3.5.3, bluebird@^3.5.5: version "3.7.2" @@ -813,7 +790,6 @@ brace-expansion@^1.1.7: braces@~3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/braces/-/braces-3.0.2.tgz#3454e1a462ee8d599e236df336cd9ea4f8afe107" - integrity sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A== dependencies: fill-range "^7.0.1" @@ -884,7 +860,6 @@ cacheable-request@^2.1.1: caching-transform@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/caching-transform/-/caching-transform-4.0.0.tgz#00d297a4206d71e2163c39eaffa8157ac0651f0f" - integrity sha512-kpqOvwXnjjN44D89K5ccQC+RUrsy7jB/XLlRrx0D7/2HNcTPqzsb6XgYoErwko6QsV184CA2YgS1fxDiiDZMWA== dependencies: hasha "^5.0.0" make-dir "^3.0.0" @@ -910,7 +885,6 @@ camelcase@^4.0.0, camelcase@^4.1.0: camelcase@^5.0.0, camelcase@^5.3.1: version "5.3.1" resolved "https://registry.yarnpkg.com/camelcase/-/camelcase-5.3.1.tgz#e3c9b31569e106811df242f715725a1f4c494320" - integrity sha512-L28STB170nwWS63UjtlEOE3dldQApaJXZkOI1uMFfzf3rRuPegHaHesyee+YxQ+W6SvRDQV6UrdOdRiR153wJg== capture-stack-trace@^1.0.0: version "1.0.1" @@ -972,7 +946,6 @@ child-process-promise@^2.2.1: chokidar@3.3.0: version "3.3.0" resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.3.0.tgz#12c0714668c55800f659e262d4962a97faf554a6" - integrity sha512-dGmKLDdT3Gdl7fBUe8XK+gAtGmzy5Fn0XkkWQuYxGIgWVPPse2CxFA5mtrlD0TOHaHjEUqkWNyP1XdHoJES/4A== dependencies: anymatch "~3.1.1" braces "~3.0.2" @@ -1013,7 +986,6 @@ 
clean-stack@^1.0.0: clean-stack@^2.0.0: version "2.2.0" resolved "https://registry.yarnpkg.com/clean-stack/-/clean-stack-2.2.0.tgz#ee8472dbb129e727b31e8a10a427dee9dfe4008b" - integrity sha512-4diC9HaTE+KRAMWhDhrGOECgWZxoevMc5TlkObMqNSsVU62PYzXZ/SMTjzyGAFF1YusgxGcSWTEXBhp0CPwQ1A== cli-boxes@^1.0.0: version "1.0.0" @@ -1056,7 +1028,6 @@ cliui@^4.0.0: cliui@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/cliui/-/cliui-5.0.0.tgz#deefcfdb2e800784aa34f46fa08e06851c7bbbc5" - integrity sha512-PYeGSEmmHM6zvoef2w8TPzlrnNpXIjTipYK780YswmIP9vjxmd6Y2a3CB2Ks6/AU8NHjZugXvo8w3oWM2qnwXA== dependencies: string-width "^3.1.0" strip-ansi "^5.2.0" @@ -1065,7 +1036,6 @@ cliui@^5.0.0: cliui@^6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/cliui/-/cliui-6.0.0.tgz#511d702c0c4e41ca156d7d0e96021f23e13225b1" - integrity sha512-t6wbgtoCXvAzst7QgXxJYqPt0usEfbgQdftEPbLL/cvv6HPE5VgvqCuAIDR0NgU52ds6rFwqrgakNLrHEjCbrQ== dependencies: string-width "^4.2.0" strip-ansi "^6.0.0" @@ -1105,7 +1075,6 @@ color-convert@^1.9.0: color-convert@^2.0.1: version "2.0.1" resolved "https://registry.yarnpkg.com/color-convert/-/color-convert-2.0.1.tgz#72d3a68d598c9bdb3af2ad1e84f21d896abd4de3" - integrity sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ== dependencies: color-name "~1.1.4" @@ -1116,7 +1085,6 @@ color-name@1.1.1: color-name@~1.1.4: version "1.1.4" resolved "https://registry.yarnpkg.com/color-name/-/color-name-1.1.4.tgz#c2a09a87acbde69543de6f63fa3995c826c536a2" - integrity sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA== colors@^1.1.2: version "1.4.0" @@ -1191,7 +1159,6 @@ content-type@~1.0.4: convert-source-map@^1.7.0: version "1.7.0" resolved "https://registry.yarnpkg.com/convert-source-map/-/convert-source-map-1.7.0.tgz#17a2cb882d7f77d3490585e2ce6c524424a3a442" - integrity sha512-4FJkXzKXEDB1snCFZlLP4gpC3JILicCpGbzG9f9G7tGqGCzETQ2hWPrcinA9oU4wtf2biUaEH5065UnMeR33oA== dependencies: 
safe-buffer "~5.1.1" @@ -1252,7 +1219,6 @@ cross-spawn@^6.0.0, cross-spawn@^6.0.5: cross-spawn@^7.0.0: version "7.0.1" resolved "https://registry.yarnpkg.com/cross-spawn/-/cross-spawn-7.0.1.tgz#0ab56286e0f7c24e153d04cc2aa027e43a9a5d14" - integrity sha512-u7v4o84SwFpD32Z8IIcPZ6z1/ie24O6RU3RbtL5Y316l3KuHVPx9ItBgWQ6VlfAFnRnTtMUrsQ9MUUTuEZjogg== dependencies: path-key "^3.1.0" shebang-command "^2.0.0" @@ -1335,7 +1301,6 @@ deepmerge@^2.1.1: default-require-extensions@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/default-require-extensions/-/default-require-extensions-3.0.0.tgz#e03f93aac9b2b6443fc52e5e4a37b3ad9ad8df96" - integrity sha512-ek6DpXq/SCpvjhpFsLFRVtIxJCRw6fUR42lYMVZuUMK7n8eMz4Uh5clckdBjEpLhn/gEBZo7hDJnJcwdKLKQjg== dependencies: strip-bom "^4.0.0" @@ -1746,7 +1711,6 @@ file-entry-cache@^5.0.1: fill-range@^7.0.1: version "7.0.1" resolved "https://registry.yarnpkg.com/fill-range/-/fill-range-7.0.1.tgz#1919a6a7c75fe38b2c7c77e5198535da9acdda40" - integrity sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ== dependencies: to-regex-range "^5.0.1" @@ -1765,7 +1729,6 @@ finalhandler@1.1.1: find-cache-dir@^3.2.0: version "3.3.1" resolved "https://registry.yarnpkg.com/find-cache-dir/-/find-cache-dir-3.3.1.tgz#89b33fad4a4670daa94f855f7fbe31d6d84fe880" - integrity sha512-t2GDMt3oGC/v+BMwzmllWDuJF/xcDtE5j/fCGbqDD7OLuJkj0cfh1YSA5VKPvwMeLFLNDBkwOKZ2X85jGLVftQ== dependencies: commondir "^1.0.1" make-dir "^3.0.2" @@ -1790,7 +1753,6 @@ find-up@^2.1.0: find-up@^4.0.0, find-up@^4.1.0: version "4.1.0" resolved "https://registry.yarnpkg.com/find-up/-/find-up-4.1.0.tgz#97afe7d6cdc0bc5928584b7c8d7b16e8a9aa5d19" - integrity sha512-PpOwAdQ/YlXQ2vj8a3h8IipDuYRi3wceVQQGYWxNINccq40Anw7BlsEXCMbt1Zt+OLA6Fq9suIpIWD0OsnISlw== dependencies: locate-path "^5.0.0" path-exists "^4.0.0" @@ -1806,7 +1768,6 @@ flat-cache@^2.0.1: flat@^4.1.0: version "4.1.0" resolved 
"https://registry.yarnpkg.com/flat/-/flat-4.1.0.tgz#090bec8b05e39cba309747f1d588f04dbaf98db2" - integrity sha512-Px/TiLIznH7gEDlPXcUD4KnBusa6kR6ayRUVcnEAbreRIuhkqow/mun59BuRXwoYk7ZQOLW1ZM05ilIvK38hFw== dependencies: is-buffer "~2.0.3" @@ -1830,7 +1791,6 @@ flush-write-stream@^1.0.0: foreground-child@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/foreground-child/-/foreground-child-2.0.0.tgz#71b32800c9f15aa8f2f83f4a6bd9bff35d861a53" - integrity sha512-dCIq9FpEcyQyXKCkyzmlPTFNgrCzPudOe+mhvJU5zAtlBnGVy2yKxtfsxK2tQBThwq225jcvBjpw1Gr40uzZCA== dependencies: cross-spawn "^7.0.0" signal-exit "^3.0.2" @@ -1872,7 +1832,6 @@ from2@^2.1.0, from2@^2.1.1: fromentries@^1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/fromentries/-/fromentries-1.2.0.tgz#e6aa06f240d6267f913cea422075ef88b63e7897" - integrity sha512-33X7H/wdfO99GdRLLgkjUrD4geAFdq/Uv0kl3HD4da6HDixd2GUg8Mw7dahLCV9r/EARkmtYBB6Tch4EEokFTQ== fs-minipass@^1.2.5: version "1.2.5" @@ -1904,7 +1863,6 @@ fs.realpath@^1.0.0: fsevents@~2.1.1: version "2.1.2" resolved "https://registry.yarnpkg.com/fsevents/-/fsevents-2.1.2.tgz#4c0a1fb34bc68e543b4b82a9ec392bfbda840805" - integrity sha512-R4wDiBwZ0KzpgOWetKDug1FZcYhqYnUYKtfZYt4mD5SBz76q0KR4Q9o7GIPamsVPGmW3EYPPJ0dOOjvx32ldZA== function-bind@^1.1.1: version "1.1.1" @@ -1934,7 +1892,6 @@ genfun@^5.0.0: gensync@^1.0.0-beta.1: version "1.0.0-beta.1" resolved "https://registry.yarnpkg.com/gensync/-/gensync-1.0.0-beta.1.tgz#58f4361ff987e5ff6e1e7a210827aa371eaac269" - integrity sha512-r8EC6NO1sngH/zdD9fiRDLdcgnbayXah+mLgManTaIZJqEC1MZstmnox8KpnI2/fxQwrp5OpCOYWLp4rBl4Jcg== gentle-fs@^2.3.0: version "2.3.0" @@ -1959,7 +1916,6 @@ get-caller-file@^1.0.1: get-caller-file@^2.0.1: version "2.0.5" resolved "https://registry.yarnpkg.com/get-caller-file/-/get-caller-file-2.0.5.tgz#4f94412a82db32f36e3b0b9741f8a97feb031f7e" - integrity sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg== get-func-name@^2.0.0: version "2.0.0" @@ 
-1997,7 +1953,6 @@ glob-parent@^5.0.0: glob-parent@~5.1.0: version "5.1.1" resolved "https://registry.yarnpkg.com/glob-parent/-/glob-parent-5.1.1.tgz#b6c1ef417c4e5663ea498f1c45afac6916bbc229" - integrity sha512-FnI+VGOpnlGHWZxthPGR+QhR78fuiK0sNLkHQv+bL9fQi57lNNdquIbna/WrfROrolq8GK5Ek6BiMwqL/voRYQ== dependencies: is-glob "^4.0.1" @@ -2135,7 +2090,6 @@ has-flag@^3.0.0: has-flag@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/has-flag/-/has-flag-4.0.0.tgz#944771fd9c81c81265c4d6941860da06bb59479b" - integrity sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ== has-symbol-support-x@^1.4.1: version "1.4.2" @@ -2171,7 +2125,6 @@ hash-base@^3.0.0: hasha@^5.0.0: version "5.2.0" resolved "https://registry.yarnpkg.com/hasha/-/hasha-5.2.0.tgz#33094d1f69c40a4a6ac7be53d5fe3ff95a269e0c" - integrity sha512-2W+jKdQbAdSIrggA8Q35Br8qKadTrqCTC8+XZvBWepKDK6m9XkX6Iz1a2yh2KP01kzAR/dpuMeUnocoLYDcskw== dependencies: is-stream "^2.0.0" type-fest "^0.8.0" @@ -2179,7 +2132,6 @@ hasha@^5.0.0: he@1.2.0: version "1.2.0" resolved "https://registry.yarnpkg.com/he/-/he-1.2.0.tgz#84ae65fa7eafb165fddb61566ae14baf05664f0f" - integrity sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw== hoek@2.x.x, hoek@^4.2.1: version "4.2.1" @@ -2196,7 +2148,6 @@ hosted-git-info@^2.7.1, hosted-git-info@^2.8.5: html-escaper@^2.0.0: version "2.0.2" resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453" - integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg== http-cache-semantics@3.8.1, http-cache-semantics@^3.8.1: version "3.8.1" @@ -2304,7 +2255,6 @@ indent-string@^3.0.0: indent-string@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/indent-string/-/indent-string-4.0.0.tgz#624f8f4497d619b2d9768531d58f4122854d7251" - integrity 
sha512-EdDDZu4A2OyIK7Lr/2zG+w5jmbuk1DVBnEwREQvBzspBJkCEbRa8GxU1lghYcaGJCnRWibjDXlq779X1/y5xwg== infer-owner@^1.0.3, infer-owner@^1.0.4: version "1.0.4" @@ -2386,14 +2336,12 @@ ipaddr.js@1.6.0: is-binary-path@~2.1.0: version "2.1.0" resolved "https://registry.yarnpkg.com/is-binary-path/-/is-binary-path-2.1.0.tgz#ea1f7f3b80f064236e83470f86c09c254fb45b09" - integrity sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw== dependencies: binary-extensions "^2.0.0" is-buffer@~2.0.3: version "2.0.4" resolved "https://registry.yarnpkg.com/is-buffer/-/is-buffer-2.0.4.tgz#3e572f23c8411a5cfd9557c849e3665e0b290623" - integrity sha512-Kq1rokWXOPXWuaMAqZiJW4XxsmD9zGx9q4aePabbn3qCRGedtH7Cm+zV8WETitMfu1wdh+Rvd6w5egwSngUX2A== is-callable@^1.1.4: version "1.1.4" @@ -2459,7 +2407,6 @@ is-npm@^1.0.0: is-number@^7.0.0: version "7.0.0" resolved "https://registry.yarnpkg.com/is-number/-/is-number-7.0.0.tgz#7535345b896734d5f80c4d06c50955527a14f12b" - integrity sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng== is-obj@^1.0.0: version "1.0.1" @@ -2508,7 +2455,6 @@ is-stream@^1.0.0, is-stream@^1.1.0: is-stream@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/is-stream/-/is-stream-2.0.0.tgz#bde9c32680d6fae04129d6ac9d921ce7815f78e3" - integrity sha512-XCoy+WlUr7d1+Z8GgSuXmpuUFC9fOhRXglJMx+dwLKTkL44Cjd4W1Z5P+BQZpr+cR93aGP4S/s7Ftw6Nd/kiEw== is-symbol@^1.0.2: version "1.0.3" @@ -2523,7 +2469,6 @@ is-typedarray@^1.0.0, is-typedarray@~1.0.0: is-windows@^1.0.2: version "1.0.2" resolved "https://registry.yarnpkg.com/is-windows/-/is-windows-1.0.2.tgz#d1850eb9791ecd18e6182ce12a30f396634bb19d" - integrity sha512-eXK1UInq2bPmjyX6e3VHIzMLobc4J94i4AWn+Hpq3OU5KkrRC96OAcR3PRJ/pGu6m8TRnBHP9dkXQVsT/COVIA== is@^3.2.1: version "3.3.0" @@ -2552,19 +2497,16 @@ isstream@~0.1.2: istanbul-lib-coverage@^3.0.0, istanbul-lib-coverage@^3.0.0-alpha.1: version "3.0.0" resolved 
"https://registry.yarnpkg.com/istanbul-lib-coverage/-/istanbul-lib-coverage-3.0.0.tgz#f5944a37c70b550b02a78a5c3b2055b280cec8ec" - integrity sha512-UiUIqxMgRDET6eR+o5HbfRYP1l0hqkWOs7vNxC/mggutCMUIhWMm8gAHb8tHlyfD3/l6rlgNA5cKdDzEAf6hEg== istanbul-lib-hook@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/istanbul-lib-hook/-/istanbul-lib-hook-3.0.0.tgz#8f84c9434888cc6b1d0a9d7092a76d239ebf0cc6" - integrity sha512-Pt/uge1Q9s+5VAZ+pCo16TYMWPBIl+oaNIjgLQxcX0itS6ueeaA+pEfThZpH8WxhFgCiEb8sAJY6MdUKgiIWaQ== dependencies: append-transform "^2.0.0" istanbul-lib-instrument@^4.0.0: version "4.0.1" resolved "https://registry.yarnpkg.com/istanbul-lib-instrument/-/istanbul-lib-instrument-4.0.1.tgz#61f13ac2c96cfefb076fe7131156cc05907874e6" - integrity sha512-imIchxnodll7pvQBYOqUu88EufLCU56LMeFPZZM/fJZ1irYcYdqroaV+ACK1Ila8ls09iEYArp+nqyC6lW1Vfg== dependencies: "@babel/core" "^7.7.5" "@babel/parser" "^7.7.5" @@ -2577,7 +2519,6 @@ istanbul-lib-instrument@^4.0.0: istanbul-lib-processinfo@^2.0.2: version "2.0.2" resolved "https://registry.yarnpkg.com/istanbul-lib-processinfo/-/istanbul-lib-processinfo-2.0.2.tgz#e1426514662244b2f25df728e8fd1ba35fe53b9c" - integrity sha512-kOwpa7z9hme+IBPZMzQ5vdQj8srYgAtaRqeI48NGmAQ+/5yKiHLV0QbYqQpxsdEF0+w14SoB8YbnHKcXE2KnYw== dependencies: archy "^1.0.0" cross-spawn "^7.0.0" @@ -2590,7 +2531,6 @@ istanbul-lib-processinfo@^2.0.2: istanbul-lib-report@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/istanbul-lib-report/-/istanbul-lib-report-3.0.0.tgz#7518fe52ea44de372f460a76b5ecda9ffb73d8a6" - integrity sha512-wcdi+uAKzfiGT2abPpKZ0hSU1rGQjUQnLvtY5MpQ7QCTahD3VODhcu4wcfY1YtkGaDD5yuydOLINXsfbus9ROw== dependencies: istanbul-lib-coverage "^3.0.0" make-dir "^3.0.0" @@ -2599,7 +2539,6 @@ istanbul-lib-report@^3.0.0: istanbul-lib-source-maps@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/istanbul-lib-source-maps/-/istanbul-lib-source-maps-4.0.0.tgz#75743ce6d96bb86dc7ee4352cf6366a23f0b1ad9" - integrity 
sha512-c16LpFRkR8vQXyHZ5nLpY35JZtzj1PQY1iZmesUbf1FZHbIupcWfjgOXBY9YHkLEQ6puz1u4Dgj6qmU/DisrZg== dependencies: debug "^4.1.1" istanbul-lib-coverage "^3.0.0" @@ -2608,7 +2547,6 @@ istanbul-lib-source-maps@^4.0.0: istanbul-reports@^3.0.0: version "3.0.1" resolved "https://registry.yarnpkg.com/istanbul-reports/-/istanbul-reports-3.0.1.tgz#1343217244ad637e0c3b18e7f6b746941a9b5e9a" - integrity sha512-Vm9xwCiQ8t2cNNnckyeAV0UdxKpcQUz4nMxsBvIu8n2kmPSiyb5uaF/8LpmKr+yqL/MdOXaX2Nmdo4Qyxium9Q== dependencies: html-escaper "^2.0.0" istanbul-lib-report "^3.0.0" @@ -2689,7 +2627,6 @@ json-stringify-safe@~5.0.1: json5@^2.1.2: version "2.1.2" resolved "https://registry.yarnpkg.com/json5/-/json5-2.1.2.tgz#43ef1f0af9835dd624751a6b7fa48874fb2d608e" - integrity sha512-MoUOQ4WdiN3yxhm7NEVJSJrieAo5hNSLQ5sj05OTRHPL9HOBy8u4Bu88jsC1jvqAdN+E1bJmsUcZH+1HQxliqQ== dependencies: minimist "^1.2.5" @@ -2892,7 +2829,6 @@ locate-path@^3.0.0: locate-path@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/locate-path/-/locate-path-5.0.0.tgz#1afba396afd676a6d42504d0a67a3a7eb9f62aa0" - integrity sha512-t7hw9pI+WvuwNJXwk5zVHpyhIqzg2qTlklJOf0mVxGSbe3Fp2VieZcduNYjaLDoy6p9uGpQEGWG87WpMKlNq8g== dependencies: p-locate "^4.1.0" @@ -2991,7 +2927,6 @@ lodash@^4.17.11, lodash@^4.17.13, lodash@^4.17.14, lodash@^4.17.15: log-symbols@3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/log-symbols/-/log-symbols-3.0.0.tgz#f3a08516a5dea893336a7dee14d18a1cfdab77c4" - integrity sha512-dSkNGuI7iG3mfvDzUuYZyvk5dD9ocYCYzNU6CYDE6+Xqd+gwme6Z00NS3dUh8mq/73HaEtT7m6W+yUPtU6BZnQ== dependencies: chalk "^2.4.2" @@ -3036,7 +2971,6 @@ make-dir@^1.0.0: make-dir@^3.0.0, make-dir@^3.0.2: version "3.0.2" resolved "https://registry.yarnpkg.com/make-dir/-/make-dir-3.0.2.tgz#04a1acbf22221e1d6ef43559f43e05a90dbb4392" - integrity sha512-rYKABKutXa6vXTXhoV18cBE7PaewPXHe/Bdq4v+ZLMhxbWApkFFplT0LcbMW+6BbjnQXzZ/sAvSE/JdguApG5w== dependencies: semver "^6.0.0" @@ -3138,7 +3072,6 @@ minimatch@3.0.4, minimatch@^3.0.4: 
minimist@^1.2.0, minimist@^1.2.5: version "1.2.5" resolved "https://registry.yarnpkg.com/minimist/-/minimist-1.2.5.tgz#67d66014b66a6a8aaa0c083c5fd58df4e4e97602" - integrity sha1-Z9ZgFLZqaoqqDAg8X9WN9OTpdgI= minipass@^2.2.1, minipass@^2.3.3: version "2.3.3" @@ -3184,14 +3117,12 @@ mississippi@^3.0.0: mkdirp@0.5.3, mkdirp@^0.5.0, mkdirp@^0.5.1, mkdirp@~0.5.0, mkdirp@~0.5.1: version "0.5.3" resolved "https://registry.yarnpkg.com/mkdirp/-/mkdirp-0.5.3.tgz#5a514b7179259287952881e94410ec5465659f8c" - integrity sha512-P+2gwrFqx8lhew375MQHHeTlY8AuOJSrGf0R5ddkEndUkmwpgUob/vQuBD1V22/Cw1/lJr4x+EjllSezBThzBg== dependencies: minimist "^1.2.5" mocha@^7.1.1: version "7.1.1" resolved "https://registry.yarnpkg.com/mocha/-/mocha-7.1.1.tgz#89fbb30d09429845b1bb893a830bf5771049a441" - integrity sha512-3qQsu3ijNS3GkWcccT5Zw0hf/rWvu1fTN9sPvEd81hlwsr30GX2GcDSSoBxo24IR8FelmrAydGC6/1J5QQP4WA== dependencies: ansi-colors "3.2.3" browser-stdout "1.3.1" @@ -3276,7 +3207,6 @@ nice-try@^1.0.4: node-environment-flags@1.0.6: version "1.0.6" resolved "https://registry.yarnpkg.com/node-environment-flags/-/node-environment-flags-1.0.6.tgz#a30ac13621f6f7d674260a54dede048c3982c088" - integrity sha512-5Evy2epuL+6TM0lCQGpFIj6KwiEsGh1SrHUhTbNX+sLbBtjidPZFAnVK9y5yU1+h//RitLbRHTIMyxQPtxMdHw== dependencies: object.getownpropertydescriptors "^2.0.3" semver "^5.7.0" @@ -3347,7 +3277,6 @@ node-pre-gyp@^0.10.3: node-preload@^0.2.0: version "0.2.1" resolved "https://registry.yarnpkg.com/node-preload/-/node-preload-0.2.1.tgz#c03043bb327f417a18fee7ab7ee57b408a144301" - integrity sha512-RM5oyBy45cLEoHqCeh+MNuFAxO0vTFBLskvQbOKnEE7YTTSN4tbN8QWDIPQ6L+WvKsB/qLEGpYe2ZZ9d4W9OIQ== dependencies: process-on-spawn "^1.0.0" @@ -3387,7 +3316,6 @@ normalize-package-data@^2.0.0, normalize-package-data@^2.4.0, normalize-package- normalize-path@^3.0.0, normalize-path@~3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/normalize-path/-/normalize-path-3.0.0.tgz#0dcd69ff23a1c9b11fd0978316644a0388216a65" - integrity 
sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA== normalize-url@2.0.1: version "2.0.1" @@ -3643,7 +3571,6 @@ number-is-nan@^1.0.0: nyc@^15.0.0: version "15.0.0" resolved "https://registry.yarnpkg.com/nyc/-/nyc-15.0.0.tgz#eb32db2c0f29242c2414fe46357f230121cfc162" - integrity sha512-qcLBlNCKMDVuKb7d1fpxjPR8sHeMVX0CHarXAVzrVWoFrigCkYR8xcrjfXSPi5HXM7EU78L6ywO7w1c5rZNCNg== dependencies: "@istanbuljs/load-nyc-config" "^1.0.0" "@istanbuljs/schema" "^0.1.2" @@ -3825,7 +3752,6 @@ p-limit@^2.0.0: p-limit@^2.2.0: version "2.2.2" resolved "https://registry.yarnpkg.com/p-limit/-/p-limit-2.2.2.tgz#61279b67721f5287aa1c13a9a7fbbc48c9291b1e" - integrity sha512-WGR+xHecKTr7EbUEhyLSh5Dube9JtdiG78ufaeLxTgpudf/20KqyMioIUZJAezlTIi6evxuoUs9YXc11cU+yzQ== dependencies: p-try "^2.0.0" @@ -3844,14 +3770,12 @@ p-locate@^3.0.0: p-locate@^4.1.0: version "4.1.0" resolved "https://registry.yarnpkg.com/p-locate/-/p-locate-4.1.0.tgz#a3428bb7088b3a60292f66919278b7c297ad4f07" - integrity sha512-R79ZZ/0wAxKGu3oYMlz8jy/kbhsNrS7SKZ7PxEHBgJ5+F2mtFW2fK2cOtBh1cHYkQsbzFV7I+EoRKe6Yt0oK7A== dependencies: p-limit "^2.2.0" p-map@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/p-map/-/p-map-3.0.0.tgz#d704d9af8a2ba684e2600d9a215983d4141a979d" - integrity sha512-d3qXVTF/s+W+CdJ5A29wywV2n8CQQYahlgz2bFiA+4eVNJbHJodPZ+/gXwPGh0bOqA+j8S+6+ckmvLGPk1QpxQ== dependencies: aggregate-error "^3.0.0" @@ -3878,7 +3802,6 @@ p-try@^2.0.0: package-hash@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/package-hash/-/package-hash-4.0.0.tgz#3537f654665ec3cc38827387fc904c163c54f506" - integrity sha512-whdkPIooSu/bASggZ96BWVvZTRMOFxnyUG5PnTSGKoJE2gd5mbVNmR2Nj20QFzxYYgAXpoqC+AiXzl+UMRh7zQ== dependencies: graceful-fs "^4.1.15" hasha "^5.0.0" @@ -3964,7 +3887,6 @@ path-exists@^3.0.0: path-exists@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/path-exists/-/path-exists-4.0.0.tgz#513bdbe2d3b95d7762e8c1137efa195c6c61b5b3" - integrity 
sha512-ak9Qy5Q7jYb2Wwcey5Fpvg2KoAc/ZIhLSLOSBmRmygPsGwkVVt0fZa0qrtMz+m6tJTAHfZQ8FnmB4MG4LWy7/w== path-is-absolute@^1.0.0: version "1.0.1" @@ -3981,7 +3903,6 @@ path-key@^2.0.0, path-key@^2.0.1: path-key@^3.1.0: version "3.1.1" resolved "https://registry.yarnpkg.com/path-key/-/path-key-3.1.1.tgz#581f6ade658cbba65a0d3380de7753295054f375" - integrity sha512-ojmeN0qd+y0jszEtoY48r0Peq5dwMEkIlCOu6Q5f41lfkswXuKtYrhgoTpLnyIcHm24Uhqx+5Tqm2InSwLhE6Q== path-parse@^1.0.6: version "1.0.6" @@ -4002,7 +3923,6 @@ performance-now@^2.1.0: picomatch@^2.0.4: version "2.2.2" resolved "https://registry.yarnpkg.com/picomatch/-/picomatch-2.2.2.tgz#21f333e9b6b8eaff02468f5146ea406d345f4dad" - integrity sha512-q0M/9eZHzmr0AulXyPwNfZjtwZ/RBZlbN3K3CErVrk50T2ASYI7Bye0EvekFY3IP1Nt2DHu0re+V2ZHIpMkuWg== pify@^2.0.0: version "2.3.0" @@ -4025,7 +3945,6 @@ pinkie@^2.0.0: pkg-dir@^4.1.0: version "4.2.0" resolved "https://registry.yarnpkg.com/pkg-dir/-/pkg-dir-4.2.0.tgz#f099133df7ede422e81d1d8448270eeb3e4261f3" - integrity sha512-HRDzbaKjC+AOWVXxAU/x54COGeIv9eb+6CkDSQoNTt4XyWoIJvuPsXizxu/Fr23EiekbtZwmh1IcIG/l/a10GQ== dependencies: find-up "^4.0.0" @@ -4052,7 +3971,6 @@ process-nextick-args@~2.0.0: process-on-spawn@^1.0.0: version "1.0.0" resolved "https://registry.yarnpkg.com/process-on-spawn/-/process-on-spawn-1.0.0.tgz#95b05a23073d30a17acfdc92a440efd2baefdc93" - integrity sha512-1WsPDsUSMmZH5LeMLegqkPDrsGgsWwk1Exipy2hvB0o/F0ASzbpIctSCcZIK1ykJvtTJULEH+20WOFjMvGnCTg== dependencies: fromentries "^1.2.0" @@ -4305,7 +4223,6 @@ readdir-scoped-modules@^1.0.0, readdir-scoped-modules@^1.1.0: readdirp@~3.2.0: version "3.2.0" resolved "https://registry.yarnpkg.com/readdirp/-/readdirp-3.2.0.tgz#c30c33352b12c96dfb4b895421a49fd5a9593839" - integrity sha512-crk4Qu3pmXwgxdSgGhgA/eXiJAPQiX4GMOZZMXnqKxHX7TaoL+3gQVo/WeuAiogr07DpnfjIMpXXa+PAIvwPGQ== dependencies: picomatch "^2.0.4" @@ -4409,7 +4326,6 @@ require-main-filename@^1.0.1: require-main-filename@^2.0.0: version "2.0.0" resolved 
"https://registry.yarnpkg.com/require-main-filename/-/require-main-filename-2.0.0.tgz#d0b329ecc7cc0f61649f62215be69af54aa8989b" - integrity sha512-NKN5kMDylKuldxYLSUfrbo5Tuzh4hd+2E8NPPX02mZtn1VuREQToYe/ZdlJy+J3uCpfaiGF05e7B8W0iXbQHmg== resolve-from@^4.0.0: version "4.0.0" @@ -4418,7 +4334,6 @@ resolve-from@^4.0.0: resolve-from@^5.0.0: version "5.0.0" resolved "https://registry.yarnpkg.com/resolve-from/-/resolve-from-5.0.0.tgz#c35225843df8f776df21c57557bc087e9dfdfc69" - integrity sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw== resolve@^1.10.0: version "1.13.1" @@ -4429,7 +4344,6 @@ resolve@^1.10.0: resolve@^1.3.2: version "1.15.1" resolved "https://registry.yarnpkg.com/resolve/-/resolve-1.15.1.tgz#27bdcdeffeaf2d6244b95bb0f9f4b4653451f3e8" - integrity sha512-84oo6ZTtoTUpjgNEr5SJyzQhzL72gaRodsSfyxC/AXRvwu0Yse9H8eF9IpGo7b8YetZhlI6v7ZQ6bKBFV/6S7w== dependencies: path-parse "^1.0.6" @@ -4475,7 +4389,6 @@ rimraf@^2.6.1, rimraf@^2.6.2: rimraf@^3.0.0: version "3.0.2" resolved "https://registry.yarnpkg.com/rimraf/-/rimraf-3.0.2.tgz#f1a5402ba6220ad52cc1282bac1ae3aa49fd061a" - integrity sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA== dependencies: glob "^7.1.3" @@ -4607,7 +4520,6 @@ shebang-command@^1.2.0: shebang-command@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/shebang-command/-/shebang-command-2.0.0.tgz#ccd0af4f8835fbdc265b82461aaf0c36663f34ea" - integrity sha512-kHxr2zZpYtdmrN1qDjrrX/Z1rR1kG8Dx+gkpK1G4eXmvXswmcE1hTWBWYUzlraYw1/yZp6YuDY77YtvbN0dmDA== dependencies: shebang-regex "^3.0.0" @@ -4618,7 +4530,6 @@ shebang-regex@^1.0.0: shebang-regex@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/shebang-regex/-/shebang-regex-3.0.0.tgz#ae16f1644d873ecad843b0307b143362d4c42172" - integrity sha512-7++dFhtcx3353uBaq8DDR4NuxBetBzC7ZQOhmTQInHEd6bSrXdiEyzCvG07Z44UYdLShWUyXt5M/yhz8ekcb1A== signal-exit@^3.0.0, signal-exit@^3.0.2: version "3.0.2" @@ -4693,7 
+4604,6 @@ source-map@^0.6.0, source-map@^0.6.1: spawn-wrap@^2.0.0: version "2.0.0" resolved "https://registry.yarnpkg.com/spawn-wrap/-/spawn-wrap-2.0.0.tgz#103685b8b8f9b79771318827aa78650a610d457e" - integrity sha512-EeajNjfN9zMnULLwhZZQU3GWBoFNkbngTUPfaawT4RkMiviTxcX0qfhVbGey39mfctfDHkWtuecgQ8NJcyQWHg== dependencies: foreground-child "^2.0.0" is-windows "^1.0.2" @@ -4909,7 +4819,6 @@ strip-ansi@^6.0.0: strip-bom@^4.0.0: version "4.0.0" resolved "https://registry.yarnpkg.com/strip-bom/-/strip-bom-4.0.0.tgz#9c3505c1db45bcedca3d9cf7a16f5c5aa3901878" - integrity sha512-3xurFv5tEgii33Zi8Jtp55wEIILR9eh34FAW00PZf+JnSsTmV/ioewSgQl97JHvgjoRGwPShsWm+IdrxB35d0w== strip-eof@^1.0.0: version "1.0.0" @@ -4926,7 +4835,6 @@ strip-json-comments@^3.0.1: supports-color@6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-6.0.0.tgz#76cfe742cf1f41bb9b1c29ad03068c05b4c0e40a" - integrity sha512-on9Kwidc1IUQo+bQdhi8+Tijpo0e1SS6RoGo2guUwn5vdaxw8RXOF9Vb2ws+ihWOmh4JnCJOvaziZWP1VABaLg== dependencies: has-flag "^3.0.0" @@ -4939,7 +4847,6 @@ supports-color@^5.3.0: supports-color@^7.1.0: version "7.1.0" resolved "https://registry.yarnpkg.com/supports-color/-/supports-color-7.1.0.tgz#68e32591df73e25ad1c4b49108a2ec507962bfd1" - integrity sha512-oRSIpR8pxT1Wr2FquTNnGet79b3BWljqOuoW/h4oBhxJ/HUbX5nX6JSruTkvXDCFMwDPvsaTTbvMLKZWSy0R5g== dependencies: has-flag "^4.0.0" @@ -4989,7 +4896,6 @@ term-size@^1.2.0: test-exclude@^6.0.0: version "6.0.0" resolved "https://registry.yarnpkg.com/test-exclude/-/test-exclude-6.0.0.tgz#04a8698661d805ea6fa293b6cb9e63ac044ef15e" - integrity sha512-cAGWPIyOHU6zlmg88jwm7VRyXnMN7iV68OGAbYDk/Mh/xC/pzVPlQtY6ngoIH/5/tciuhGfvESU8GrHrcxD56w== dependencies: "@istanbuljs/schema" "^0.1.2" glob "^7.1.4" @@ -5031,7 +4937,6 @@ to-fast-properties@^2.0.0: to-regex-range@^5.0.1: version "5.0.1" resolved "https://registry.yarnpkg.com/to-regex-range/-/to-regex-range-5.0.1.tgz#1648c44aae7c8d988a326018ed72f5b4dd0392e4" - integrity 
sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ== dependencies: is-number "^7.0.0" @@ -5123,7 +5028,6 @@ type-is@~1.6.15, type-is@~1.6.16: typedarray-to-buffer@^3.1.5: version "3.1.5" resolved "https://registry.yarnpkg.com/typedarray-to-buffer/-/typedarray-to-buffer-3.1.5.tgz#a97ee7a9ff42691b9f783ff1bc5112fe3fca9080" - integrity sha512-zdu8XMNEDepKKR+XYOXAVPtWui0ly0NtohUscw+UmaHiAWT8hrV1rr//H6V+0DvJ3OQ19S979M0laLfX8rm82Q== dependencies: is-typedarray "^1.0.0" @@ -5307,7 +5211,6 @@ which@1.3.1, which@^1.2.9, which@^1.3.0, which@^1.3.1: which@^2.0.1: version "2.0.2" resolved "https://registry.yarnpkg.com/which/-/which-2.0.2.tgz#7c6a8dd0a636a0327e10b59c9286eee93f3f51b1" - integrity sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA== dependencies: isexe "^2.0.0" @@ -5343,7 +5246,6 @@ wrap-ansi@^2.0.0: wrap-ansi@^5.1.0: version "5.1.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-5.1.0.tgz#1fd1f67235d5b6d0fee781056001bfb694c03b09" - integrity sha512-QC1/iN/2/RPVJ5jYK8BGttj5z83LmSKmvbvrXPNCLZSEb32KKVDJDl/MOt2N01qU2H/FkzEa9PKto1BqDjtd7Q== dependencies: ansi-styles "^3.2.0" string-width "^3.0.0" @@ -5352,7 +5254,6 @@ wrap-ansi@^5.1.0: wrap-ansi@^6.2.0: version "6.2.0" resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-6.2.0.tgz#e9393ba07102e6c91a3b221478f0257cd2856e53" - integrity sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA== dependencies: ansi-styles "^4.0.0" string-width "^4.1.0" @@ -5381,7 +5282,6 @@ write-file-atomic@^2.3.0, write-file-atomic@^2.4.3: write-file-atomic@^3.0.0: version "3.0.3" resolved "https://registry.yarnpkg.com/write-file-atomic/-/write-file-atomic-3.0.3.tgz#56bd5c5a5c70481cd19c571bd39ab965a5de56e8" - integrity sha512-AvHcyZ5JnSfq3ioSyjrBkH9yW4m7Ayk8/9My/DD9onKeu/94fwrMocemO2QAJFAlnnDN+ZDS+ZjAR5ua1/PV/Q== dependencies: imurmurhash "^0.1.4" is-typedarray "^1.0.0" @@ -5400,6 +5300,10 @@ 
ws@^6.0.0: dependencies: async-limiter "~1.0.0" +ws@^7.3.0: + version "7.3.0" + resolved "https://registry.yarnpkg.com/ws/-/ws-7.3.0.tgz#4b2f7f219b3d3737bc1a2fbf145d825b94d38ffd" + xdg-basedir@^3.0.0: version "3.0.0" resolved "https://registry.yarnpkg.com/xdg-basedir/-/xdg-basedir-3.0.0.tgz#496b2cc109eca8dbacfe2dc72b603c17c5870ad4" @@ -5441,7 +5345,6 @@ yallist@^3.0.3: yargs-parser@13.1.2, yargs-parser@^13.1.2: version "13.1.2" resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-13.1.2.tgz#130f09702ebaeef2650d54ce6e3e5706f7a4fb38" - integrity sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg== dependencies: camelcase "^5.0.0" decamelize "^1.2.0" @@ -5449,7 +5352,6 @@ yargs-parser@13.1.2, yargs-parser@^13.1.2: yargs-parser@^18.1.1: version "18.1.2" resolved "https://registry.yarnpkg.com/yargs-parser/-/yargs-parser-18.1.2.tgz#2f482bea2136dbde0861683abea7756d30b504f1" - integrity sha512-hlIPNR3IzC1YuL1c2UwwDKpXlNFBqD1Fswwh1khz5+d8Cq/8yc/Mn0i+rQXduu8hcrFKvO7Eryk+09NecTQAAQ== dependencies: camelcase "^5.0.0" decamelize "^1.2.0" @@ -5463,7 +5365,6 @@ yargs-parser@^9.0.2: yargs-unparser@1.6.0: version "1.6.0" resolved "https://registry.yarnpkg.com/yargs-unparser/-/yargs-unparser-1.6.0.tgz#ef25c2c769ff6bd09e4b0f9d7c605fb27846ea9f" - integrity sha512-W9tKgmSn0DpSatfri0nx52Joq5hVXgeLiqR/5G0sZNDoLZFOr/xjBUDcShCOGNsBnEMNo1KAMBkTej1Hm62HTw== dependencies: flat "^4.1.0" lodash "^4.17.15" @@ -5472,7 +5373,6 @@ yargs-unparser@1.6.0: yargs@13.3.2, yargs@^13.3.0: version "13.3.2" resolved "https://registry.yarnpkg.com/yargs/-/yargs-13.3.2.tgz#ad7ffefec1aa59565ac915f82dccb38a9c31a2dd" - integrity sha512-AX3Zw5iPruN5ie6xGRIDgqkT+ZhnRlZMLMHAs8tg7nRruy2Nb+i5o9bwghAogtM08q1dpr2LVoS8KSTMYpWXUw== dependencies: cliui "^5.0.0" find-up "^3.0.0" @@ -5505,7 +5405,6 @@ yargs@^11.0.0: yargs@^15.0.2: version "15.3.1" resolved "https://registry.yarnpkg.com/yargs/-/yargs-15.3.1.tgz#9505b472763963e54afe60148ad27a330818e98b" - integrity 
sha512-92O1HWEjw27sBfgmXiixJWT5hRBp2eobqXicLtPBIDBhYB+1HpwZlXmbW2luivBJHBzki+7VyCLRtAkScbTBQA== dependencies: cliui "^6.0.0" decamelize "^1.2.0" diff --git a/src/sdk/pynni/requirements.txt b/src/sdk/pynni/requirements.txt index 885c909ca8..282f572631 100644 --- a/src/sdk/pynni/requirements.txt +++ b/src/sdk/pynni/requirements.txt @@ -1,5 +1,6 @@ # core json_tricks +websockets # hyperopt tuner numpy diff --git a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py index 059449c02c..ab52040684 100644 --- a/tools/nni_trial_tool/base_channel.py +++ b/tools/nni_trial_tool/base_channel.py @@ -40,6 +40,23 @@ def __init__(self, args): self.args = args self.node_id = self.args.node_id + @abstractmethod + def _inner_send(self, message): + pass + + @abstractmethod + def _inner_receive(self): + return [] + + @abstractmethod + def _inner_open(self): + pass + + @abstractmethod + def _inner_close(self): + pass + + def open(self): # initialize receive, send threads. self.is_running = True self.receive_queue = Queue() @@ -49,40 +66,19 @@ def __init__(self, args): self.send_thread = threading.Thread(target=self._send_loop) self.send_thread.start() - @abstractmethod - def _inner_send(self, message): - pass - - @abstractmethod - def _inner_receive(self): - return [] - - def _receive_loop(self): - while (self.is_running): - messages = self._inner_receive() - if messages is not None: - for message in messages: - self.receive_queue.put(message) - time.sleep(INTERVAL_SECONDS) + self._inner_open() - def _send_loop(self): - while (self.is_running): - message = None - try: - # no sleep, since it's a block call with INTERVAL_SECONDS second timeout - message = self.send_queue.get(True, INTERVAL_SECONDS) - except Empty: - # do nothing, if no command received. 
- pass - if message is not None: - if self.node_id is None: - nni_log(LogType.Info, 'Sending command: %s' % message) - else: - nni_log(LogType.Info, 'Sending command(%s): %s' % (self.node_id, message)) - self._inner_send(message) + client_info = { + "isReady": True, + "runnerId": self.args.runner_id, + "expId": self.args.exp_id, + } + nni_log(LogType.Info, 'Channel: send ready information %s' % client_info) + self.send(CommandType.Initialized, client_info) def close(self): self.is_running = False + self._inner_close() def send(self, command, data): """Send command to Training Service. @@ -135,3 +131,49 @@ def receive(self): except Exception as identifier: nni_log(LogType.Error, 'meet unhandled exception in base_channel: %s' % identifier) return command, data + + def _fetch_message(self, buffer, has_new_line=False): + messages = [] + while(len(buffer)) >= 16: + header = buffer[:16] + length = int(header[2:]) + + message_length = length+16 + total_length = message_length + if has_new_line: + total_length += 1 + + # break, if buffer is too short. + if len(buffer) < total_length: + break + data = buffer[16:message_length] + if has_new_line and 10 != buffer[total_length-1]: + nni_log(LogType.Error, 'end of message should be \\n, but got {}'.format(self.in_cache[total_length-1])) + buffer = buffer[total_length:] + messages.append(header + data) + + return messages, buffer + + def _receive_loop(self): + while (self.is_running): + messages = self._inner_receive() + if messages is not None: + for message in messages: + self.receive_queue.put(message) + time.sleep(INTERVAL_SECONDS) + + def _send_loop(self): + while (self.is_running): + message = None + try: + # no sleep, since it's a block call with INTERVAL_SECONDS second timeout + message = self.send_queue.get(True, INTERVAL_SECONDS) + except Empty: + # do nothing, if no command received. 
+ pass + if message is not None: + if self.node_id is None: + nni_log(LogType.Info, 'Sending command: %s' % message) + else: + nni_log(LogType.Info, 'Sending command(%s): %s' % (self.node_id, message)) + self._inner_send(message) diff --git a/tools/nni_trial_tool/file_channel.py b/tools/nni_trial_tool/file_channel.py index d0b6478000..85384c0e6d 100644 --- a/tools/nni_trial_tool/file_channel.py +++ b/tools/nni_trial_tool/file_channel.py @@ -24,8 +24,10 @@ def __init__(self, args): super(FileChannel, self).__init__(args) - def close(self): - super(FileChannel, self).close() + def _inner_open(self): + pass + + def _inner_close(self): if self.out_file is not None: self.out_file.close() self.out_file = None @@ -72,18 +74,5 @@ def _inner_receive(self): if count > 0: self.in_cache += self.in_file.read(count) self.in_offset = new_offset - while(len(self.in_cache)) >= 16: - header = self.in_cache[:16] - length = int(header[2:]) - - # consider there is an \n at end of a message. - total_length = length+16+1 - # break, if buffer is too short. 
- if len(self.in_cache) < total_length: - break - data = self.in_cache[16:total_length-1] - if 10 != self.in_cache[total_length-1]: - nni_log(LogType.Error, 'end of message should be \\n, but got {}'.format(self.in_cache[total_length-1])) - self.in_cache = self.in_cache[total_length:] - messages.append(header + data) + messages, self.in_cache = self._fetch_message(self.in_cache, True) return messages diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index dbbacc767e..74e0a9d58d 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -27,18 +27,20 @@ def main_loop(args): idle_last_time = datetime.now() gpu_refresh_last_time = datetime.now() - timedelta(minutes=1) - # init command channel - command_channel = None - if args.command_channel == "rest": - command_channel = FileChannel(args) - else: - command_channel = FileChannel(args) - nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel))) - args.command_channel = command_channel + try: + trials = dict() - trials = dict() + # init command channel + command_channel = None + if args.command_channel == "file": + command_channel = FileChannel(args) + else: + command_channel = WebChannel(args) + command_channel.open() + + nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel))) + args.command_channel = command_channel - try: # command loop while True: command_type, command_data = command_channel.receive() @@ -77,8 +79,9 @@ def main_loop(args): command_channel.send(CommandType.ReportGpuInfo, gpu_info) gpu_refresh_last_time = datetime.now() time.sleep(0.5) - except Exception: + except Exception as ex: traceback.print_exc() + raise ex finally: nni_log(LogType.Info, "main_loop exits.") @@ -111,26 +114,26 @@ def check_version(args): else: try: trial_runner_version = regular.search(trial_runner_version).group('version') - 
nni_log(LogType.Info, '{0}: runner_version is {1}'.format(args.runner_name, trial_runner_version)) + nni_log(LogType.Info, '{0}: runner_version is {1}'.format(args.node_id, trial_runner_version)) nni_manager_version = regular.search(args.nni_manager_version).group('version') - nni_log(LogType.Info, '{0}: nni_manager_version is {1}'.format(args.runner_name, nni_manager_version)) + nni_log(LogType.Info, '{0}: nni_manager_version is {1}'.format(args.node_id, nni_manager_version)) log_entry = {} if trial_runner_version != nni_manager_version: - nni_log(LogType.Error, '{0}: Version does not match!'.format(args.runner_name)) + nni_log(LogType.Error, '{0}: Version does not match!'.format(args.node_id)) error_message = '{0}: NNIManager version is {1}, Trial runner version is {2}, NNI version does not match!'.format( - args.runner_name, nni_manager_version, trial_runner_version) + args.node_id, nni_manager_version, trial_runner_version) log_entry['tag'] = 'VCFail' log_entry['msg'] = error_message - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_id), json.dumps(log_entry), 10, + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_name), json.dumps(log_entry), 10, False) os._exit(1) else: - nni_log(LogType.Info, '{0}: Version match!'.format(args.runner_name)) + nni_log(LogType.Info, '{0}: Version match!'.format(args.node_id)) log_entry['tag'] = 'VCSuccess' - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_id), json.dumps(log_entry), 10, + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_name), json.dumps(log_entry), 10, False) except AttributeError as err: - nni_log(LogType.Error, '{0}: {1}'.format(args.runner_name, err)) + nni_log(LogType.Error, '{0}: {1}'.format(args.node_id, err)) def fetch_parameter_file(args): @@ -179,8 +182,9 @@ def run(self): args.exp_id = settings["experimentId"] args.platform = settings["platform"] - # 
runner_id is unique runner in experiment, and will be updated if it's multi-nodes - args.runner_id = "runner_"+os.path.basename(os.path.realpath(os.path.curdir)) + # runner_id is unique runner in experiment + args.runner_id = os.path.basename(os.path.realpath(os.path.curdir)) + args.runner_name = "runner_"+args.runner_id args.enable_gpu_collect = settings["enableGpuCollector"] args.command_channel = settings["commandChannel"] @@ -210,6 +214,7 @@ def run(self): from .url_utils import gen_parameter_meta_url, gen_send_version_url from .trial import Trial from .file_channel import FileChannel + from .web_channel import WebChannel from .base_channel import CommandType is_multi_node = args.node_count > 1 @@ -224,26 +229,23 @@ def run(self): with open(unique_check_file_name, "w") as unique_check_file: unique_check_file.write("%s" % (int(datetime.now().timestamp() * 1000))) args.node_id = node_id - args.runner_name = "%s_%s" % (args.runner_id, node_id) else: # node id is unique in the runner args.node_id = None - # runner_name is unique node in experiment, and will be updated if it's multi-nodes - args.runner_name = args.runner_id trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'runner', - StdOutputType.Stdout, args.log_collection, args.runner_id) + StdOutputType.Stdout, args.log_collection, args.runner_name) sys.stdout = sys.stderr = trial_runner_syslogger - nni_log(LogType.Info, "{}: merged args is {}".format(args.runner_name, args)) + nni_log(LogType.Info, "{}: merged args is {}".format(args.node_id, args)) if args.trial_command is None: - nni_log(LogType.Error, "{}: no command is found.".format(args.runner_name)) + nni_log(LogType.Error, "{}: no command is found.".format(args.node_id)) os._exit(1) check_version(args) try: main_loop(args) except SystemExit as se: - nni_log(LogType.Info, '{}: NNI trial runner exit with code {}'.format(args.runner_name, se.code)) + nni_log(LogType.Info, '{}: NNI trial runner exit with code 
{}'.format(args.node_id, se.code)) os._exit(se.code) finally: if trial_runner_syslogger is not None: diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py new file mode 100644 index 0000000000..d386f47b70 --- /dev/null +++ b/tools/nni_trial_tool/web_channel.py @@ -0,0 +1,52 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +import asyncio + +import websockets + +from .base_channel import BaseChannel +from .log_utils import LogType, nni_log + + +class WebChannel(BaseChannel): + + def __init__(self, args): + self.node_id = args.node_id + self.args = args + self.client = None + self.in_cache = b"" + + super(WebChannel, self).__init__(args) + + self._event_loop = None + + def _inner_open(self): + url = "ws://{}:{}".format(self.args.nnimanager_ip, self.args.nnimanager_port) + nni_log(LogType.Info, 'WebChannel: connected with info %s' % url) + + connect = websockets.connect(url) + self._event_loop = asyncio.get_event_loop() + client = self._event_loop.run_until_complete(connect) + self.client = client + + def _inner_close(self): + if self.client is not None: + self._event_loop.run_until_complete(self.client.close()) + self._event_loop.close() + self.client = None + self._event_loop = None + + def _inner_send(self, message): + loop = asyncio.new_event_loop() + loop.run_until_complete(self.client.send(message)) + + def _inner_receive(self): + messages = [] + if self.client is not None: + received = self._event_loop.run_until_complete(self.client.recv()) + # receive message is string, to get consistent result, encode it here. 
+ self.in_cache += received.encode("utf8") + messages, self.in_cache = self._fetch_message(self.in_cache) + + return messages diff --git a/tools/setup.py b/tools/setup.py index d5e527e596..de96a9310e 100644 --- a/tools/setup.py +++ b/tools/setup.py @@ -16,7 +16,8 @@ 'astor', 'schema', 'PythonWebHDFS', - 'colorama' + 'colorama', + 'websockets' ], author = 'Microsoft NNI Team', From 50180397642aea84d6d4b62d74a6bba0212654b3 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Wed, 24 Jun 2020 13:46:25 +0800 Subject: [PATCH 35/98] fix eslint errors, and rename rest to web --- .../reusable/channels/webCommandChannel.ts | 2 +- .../training_service/reusable/environment.ts | 2 +- .../reusable/trialDispatcher.ts | 52 ++++++++++--------- 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts index cdb612e39b..0284dfab08 100644 --- a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts @@ -52,7 +52,7 @@ export class WebCommandChannel extends CommandChannel { private clients: Map = new Map(); public get channelName(): Channel { - return "rest"; + return "web"; } public async config(key: string, value: any): Promise { diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index daf03f946f..1d1fb94cbb 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -25,7 +25,7 @@ import { TrialJobStatus } from "../../common/trainingService"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; -export type Channel = "rest" | "file" | "aml" +export type Channel = "web" | "file" | "aml" export abstract class EnvironmentService { diff --git 
a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 0f38c3f85e..ab1a678548 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -504,36 +504,38 @@ class TrialDispatcher implements TrainingService { environment.gpuSummary.set(nodeId, (data)); break; case INITIALIZED: - const oldStatus = environment.status; - let isAllReady = true; - - if (environment.nodeCount > 1) { - let node = environment.nodes.get(nodeId); - if (node === undefined) { - node = new NodeInfomation(nodeId); - environment.nodes.set(nodeId, node); - } - const oldNodeStatus = node.status; - if (oldNodeStatus === "UNKNOWN") { - node.status = "RUNNING"; - } + { + const oldStatus = environment.status; + let isAllReady = true; + + if (environment.nodeCount > 1) { + let node = environment.nodes.get(nodeId); + if (node === undefined) { + node = new NodeInfomation(nodeId); + environment.nodes.set(nodeId, node); + } + const oldNodeStatus = node.status; + if (oldNodeStatus === "UNKNOWN") { + node.status = "RUNNING"; + } - if (environment.nodes.size === environment.nodeCount) { - for (const node of environment.nodes.values()) { - if (node.status !== "RUNNING") { - isAllReady = false; - break; + if (environment.nodes.size === environment.nodeCount) { + for (const node of environment.nodes.values()) { + if (node.status !== "RUNNING") { + isAllReady = false; + break; + } } + } else { + isAllReady = false; } - } else { - isAllReady = false; } - } - // single node is always ready to set env status - if (isAllReady && oldStatus === "UNKNOWN") { - environment.status = "RUNNING"; - this.log.info(`TrialDispatcher: env ${environment.id} received initialized message, old status: ${oldStatus}, new status: ${environment.status}.`); + // single node is always ready to set env status + if (isAllReady && oldStatus === "UNKNOWN") { + environment.status = 
"RUNNING"; + this.log.info(`TrialDispatcher: env ${environment.id} received initialized message, old status: ${oldStatus}, new status: ${environment.status}.`); + } } break; case TRIAL_END: From 283bcebf890a585c351df796da230e9ae20b66fe Mon Sep 17 00:00:00 2001 From: Chi Song Date: Wed, 24 Jun 2020 15:20:15 +0800 Subject: [PATCH 36/98] remove trial service, as it's replaced by channel. --- .../reusable/channels/webCommandChannel.ts | 3 + .../environments/openPaiEnvironmentService.ts | 4 +- .../reusable/routerTrainingService.ts | 18 +---- .../training_service/reusable/trial.ts | 13 ---- .../reusable/trialDispatcher.ts | 31 +++++---- .../reusable/trials/storageTrialService.ts | 69 ------------------- tools/nni_trial_tool/trial.py | 13 ++++ tools/nni_trial_tool/trial_runner.py | 6 ++ 8 files changed, 43 insertions(+), 114 deletions(-) delete mode 100644 src/nni_manager/training_service/reusable/trials/storageTrialService.ts diff --git a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts index 0284dfab08..09f48bae3f 100644 --- a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts @@ -127,6 +127,9 @@ export class WebCommandChannel extends CommandChannel { runnerConnection.AddClient(client); connection = runnerConnection; isValid = true; + this.log.debug(`WebCommandChannel: client of env ${runnerConnection.environment.id} initialized`); + } else { + this.log.warning(`WebCommandChannel: client is not initialized, runnerId: ${result.runnerId}, command: ${commandType}, expId: ${this.expId}, exists: ${this.runnerConnections.has(result.runnerId)}`); } } diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 9c8662242f..9a463cbb53 100644 --- 
a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -135,9 +135,9 @@ export class OpenPaiEnvironmentService implements EnvironmentService { const oldEnvironmentStatus = environment.status; switch (jobResponse.state) { case 'RUNNING': - // RUNNING state is set by runner. - break; case 'WAITING': + // RUNNING status is set by runner, and ignore waiting status + break; case 'SUCCEEDED': case 'FAILED': environment.setFinalStatus(jobResponse.state); diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index afbcc4ff9f..35d92fecef 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -19,6 +19,7 @@ 'use strict'; +import { Container, Scope } from 'typescript-ioc'; import * as component from '../../common/component'; import { getLogger, Logger } from '../../common/log'; import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric } from '../../common/trainingService'; @@ -26,16 +27,11 @@ import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; import { PAIK8STrainingService } from '../pai/paiK8S/paiK8STrainingService'; -import { TrialDispatcher } from './trialDispatcher'; -import { Container, Scope } from 'typescript-ioc'; import { EnvironmentService } from './environment'; import { OpenPaiEnvironmentService } from './environments/openPaiEnvironmentService'; -import { StorageService } from './storageService'; import { MountedStorageService } from './storages/mountedStorageService'; -import { TrialService } from './trial'; -import { StorageTrialService } from './trials/storageTrialService'; -import { CommandChannel } from 
'./commandChannel'; -import { FileCommandChannel } from './channels/fileCommandChannel'; +import { StorageService } from './storageService'; +import { TrialDispatcher } from './trialDispatcher'; /** @@ -124,14 +120,6 @@ class RouterTrainingService implements TrainingService { Container.bind(StorageService) .to(MountedStorageService) .scope(Scope.Singleton); - // TODO to support other trialService later. - Container.bind(TrialService) - .to(StorageTrialService) - .scope(Scope.Singleton); - - Container.bind(CommandChannel) - .to(FileCommandChannel) - .scope(Scope.Singleton); } else { this.log.debug(`caching metadata key:{} value:{}, as training service is not determined.`); this.internalTrainingService = component.get(PAIK8STrainingService); diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts index 396d912ff9..65e008b336 100644 --- a/src/nni_manager/training_service/reusable/trial.ts +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -24,19 +24,6 @@ import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../c import { EnvironmentInformation, NodeInfomation } from "./environment"; import { GPUInfo } from "training_service/common/gpuData"; -export abstract class TrialService { - protected readonly log: Logger; - - public abstract config(key: string, value: any): Promise; - public abstract updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise; - public abstract startTrial(trial: TrialDetail): Promise; - public abstract stopTrial(trial: TrialDetail): Promise; - - constructor() { - this.log = getLogger(); - } -} - export class TrialDetail implements TrialJobDetail { public id: string; public status: TrialJobStatus; diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index ab1a678548..2b25742c46 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ 
b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -27,7 +27,7 @@ import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; import { delay, getLogLevel, getVersion, uniqueString } from '../../common/utils'; -import { GPU_INFO, INITIALIZED, TRIAL_END } from '../../core/commands'; +import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, SEND_TRIAL_JOB_PARAMETER, TRIAL_END } from '../../core/commands'; import { GPUSummary } from '../../training_service/common/gpuData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; @@ -35,10 +35,10 @@ import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { validateCodeDir } from '../common/util'; import { WebCommandChannel } from './channels/webCommandChannel'; import { Command, CommandChannel } from './commandChannel'; -import { EnvironmentInformation, EnvironmentService, RunnerSettings, NodeInfomation } from './environment'; +import { EnvironmentInformation, EnvironmentService, NodeInfomation, RunnerSettings } from './environment'; import { JobRestServer } from './jobRestServer'; import { StorageService } from './storageService'; -import { TrialDetail, TrialService } from './trial'; +import { TrialDetail } from './trial'; /** * It uses to manage jobs on training platforms @@ -131,9 +131,16 @@ class TrialDispatcher implements TrainingService { // to support multi phase public async updateTrialJob(trialJobId: string, form: TrialJobApplicationForm): Promise { const trialDetail = await this.getTrialJob(trialJobId); + const environment = trialDetail.environment; + if (environment === undefined) { + throw new Error(`TrialDispatcher: trial ${trialJobId}'s env shouldn't be undefined in 
updateTrialJob.`); + } - const trialService = component.get(TrialService); - await trialService.updateTrial(trialDetail, form); + const message = { + "trialId": trialJobId, + "parameters": form.hyperParameters, + } + await this.commandChannel.sendCommand(environment, SEND_TRIAL_JOB_PARAMETER, message); return trialDetail; } @@ -147,8 +154,7 @@ class TrialDispatcher implements TrainingService { { const environment = trial.environment; if (environment) { - const trialService = component.get(TrialService); - await trialService.stopTrial(trial); + await this.commandChannel.sendCommand(environment, KILL_TRIAL_JOB, trial.id); trial.isEarlyStopped = isEarlyStopped; trial.status = trial.isEarlyStopped === true ? 'EARLY_STOPPED' : 'USER_CANCELED'; @@ -182,9 +188,6 @@ class TrialDispatcher implements TrainingService { throw new Error(`trial config shouldn't be undefined in run()`); } - const trialService = component.get(TrialService); - trialService.config("channel", this.commandChannel); - const environmentService = component.get(EnvironmentService); if (environmentService.hasStorageService) { this.log.info(`TrialDispatcher: copying code and settings.`); @@ -360,8 +363,7 @@ class TrialDispatcher implements TrainingService { // for example, in horovod, it's just sleep command, has no impact on trial result. 
if (environment.nodeCount > completedCount) { this.log.info(`stop partial completed trial ${trial.id}`); - const trialService = component.get(TrialService); - await trialService.stopTrial(trial); + await this.commandChannel.sendCommand(environment, KILL_TRIAL_JOB, trial.id); } for (const node of trial.nodes.values()) { if (node.status === "FAILED") { @@ -479,8 +481,7 @@ class TrialDispatcher implements TrainingService { } trial.startTime = Date.now(); trial.status = "RUNNING"; - const trialService = component.get(TrialService); - await trialService.startTrial(trial); + await this.commandChannel.sendCommand(trial.environment, NEW_TRIAL_JOB, trial.settings); } private releaseEnvironment(trial: TrialDetail): void { @@ -515,7 +516,7 @@ class TrialDispatcher implements TrainingService { environment.nodes.set(nodeId, node); } const oldNodeStatus = node.status; - if (oldNodeStatus === "UNKNOWN") { + if (oldNodeStatus === "UNKNOWN" || oldNodeStatus === "WAITING") { node.status = "RUNNING"; } diff --git a/src/nni_manager/training_service/reusable/trials/storageTrialService.ts b/src/nni_manager/training_service/reusable/trials/storageTrialService.ts deleted file mode 100644 index fd5dfdc30f..0000000000 --- a/src/nni_manager/training_service/reusable/trials/storageTrialService.ts +++ /dev/null @@ -1,69 +0,0 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ - -'use strict'; - -import * as component from "../../../common/component"; -import { TrialJobApplicationForm } from "../../../common/trainingService"; -import { generateParamFileName } from "../../../common/utils"; -import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../../core/commands'; -import { CommandChannel } from "../commandChannel"; -import { StorageService } from "../storageService"; -import { TrialDetail, TrialService } from "../trial"; - -@component.Singleton -export class StorageTrialService extends TrialService { - private commandChannel: CommandChannel | undefined; - - public async config(key: string, value: any): Promise { - switch (key) { - case "channel": - this.commandChannel = value; - break; - } - } - - public async startTrial(trial: TrialDetail): Promise { - if (trial.environment === undefined) { - throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); - } - if (this.commandChannel === undefined) { - throw new Error(`trialService: commandChannel shouldn't be undefined!`); - } - await this.commandChannel.sendCommand(trial.environment, NEW_TRIAL_JOB, trial.settings); - } - - public async stopTrial(trial: TrialDetail): Promise { - if (trial.environment === undefined) { - throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); - } - if (this.commandChannel === undefined) { - throw new Error(`trialService: commandChannel shouldn't be undefined!`); - } - await 
this.commandChannel.sendCommand(trial.environment, KILL_TRIAL_JOB, trial.id); - } - - public async updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise { - const storageService = component.get(StorageService); - const fileName = storageService.joinPath(trial.workingDirectory, generateParamFileName(form.hyperParameters)) - - // Write file content ( parameter.cfg ) to working folders - await storageService.save(form.hyperParameters.value, fileName); - } -} diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index 63553a4ae4..dd15ca2ec2 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -55,6 +55,7 @@ def run(self): environ['NNI_TRIAL_SEQ_ID'] = str(self.data["sequenceId"]) environ['NNI_OUTPUT_DIR'] = os.path.join(trial_working_dir, "nnioutput") environ['NNI_SYS_DIR'] = trial_working_dir + self.working_dir = trial_working_dir # prepare code and parameters prepared_flag_file_name = os.path.join(trial_working_dir, "trial_prepared") @@ -91,6 +92,18 @@ def run(self): nni_log(LogType.Info, '{0}: spawns a subprocess (pid {1}) to run command: {2}'. 
format(self.name, self.process.pid, shlex.split(self.args.trial_command))) + def save_parameter_file(self, command_data): + parameters = command_data["parameters"] + file_index = int(parameters["index"]) + if file_index == 0: + parameter_file_name = "parameter.cfg" + else: + parameter_file_name = "parameter_{}.cfg".format(file_index) + parameter_file_name = os.path.join(self.working_dir, parameter_file_name) + with open(parameter_file_name, "w") as parameter_file: + nni_log(LogType.Info, '%s: saving parameter %s' % (self.name, parameters["value"])) + parameter_file.write(parameters["value"]) + def is_running(self): if (self.process is None): return False diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 74e0a9d58d..0dfcc59b00 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -57,7 +57,13 @@ def main_loop(args): elif command_type == CommandType.KillTrialJob: trial_id = command_data if trial_id in trials.keys(): + trial = trials[trial_id] trial.kill(command_data) + elif command_type == CommandType.SendTrialJobParameter: + trial_id = command_data["trialId"] + if trial_id in trials.keys(): + trial = trials[trial_id] + trial.save_parameter_file(command_data) elif command_type is not None: raise Exception("unknown command %s" % command_type) From 671f5d819f4f32f316fa9a3966b9139cece1e663 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Wed, 24 Jun 2020 16:23:21 +0800 Subject: [PATCH 37/98] fix merged problem, and small refine for ut. 
--- .../training_service/reusable/channels/fileCommandChannel.ts | 4 ++-- .../training_service/reusable/channels/webCommandChannel.ts | 4 ++-- src/nni_manager/training_service/reusable/commandChannel.ts | 4 ++-- src/nni_manager/training_service/reusable/environment.ts | 2 +- src/nni_manager/training_service/reusable/trial.ts | 3 +-- tools/nni_cmd/config_schema.py | 2 +- 6 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts index a5e0aebd74..002a5d5492 100644 --- a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts @@ -53,13 +53,13 @@ export class FileCommandChannel extends CommandChannel { // do nothing } - public start(): void { + public async start(): Promise { // start command loops this.receiveLoop(); this.sendLoop(); } - public stop(): void { + public async stop(): Promise { this.stopping = true; } diff --git a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts index 09f48bae3f..bf6b2166a1 100644 --- a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts @@ -63,7 +63,7 @@ export class WebCommandChannel extends CommandChannel { } } - public start(): void { + public async start(): Promise { if (this.httpServer === undefined) { throw new Error(`http server is not initialized!`); } @@ -81,7 +81,7 @@ export class WebCommandChannel extends CommandChannel { }); } - public stop(): void { + public async stop(): Promise { if (this.webSocketServer !== undefined) { this.webSocketServer.close(); } diff --git a/src/nni_manager/training_service/reusable/commandChannel.ts b/src/nni_manager/training_service/reusable/commandChannel.ts 
index 96756e67a1..ed9d6d0291 100644 --- a/src/nni_manager/training_service/reusable/commandChannel.ts +++ b/src/nni_manager/training_service/reusable/commandChannel.ts @@ -72,8 +72,8 @@ export abstract class CommandChannel { public abstract get channelName(): Channel; public abstract config(key: string, value: any): Promise; - public abstract start(): void; - public abstract stop(): void; + public abstract start(): Promise; + public abstract stop(): Promise; protected abstract sendCommandInternal(environment: EnvironmentInformation, message: string): Promise; protected abstract createRunnerConnection(environment: EnvironmentInformation): RunnerConnection; diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 1d1fb94cbb..c8f395eaf7 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -25,7 +25,7 @@ import { TrialJobStatus } from "../../common/trainingService"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; -export type Channel = "web" | "file" | "aml" +export type Channel = "web" | "file" | "aml" | "ut"; export abstract class EnvironmentService { diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts index 65e008b336..bc30160af4 100644 --- a/src/nni_manager/training_service/reusable/trial.ts +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -19,10 +19,9 @@ 'use strict'; -import { Logger, getLogger } from "../../common/log"; import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from "../../common/trainingService"; +import { GPUInfo } from "../../training_service/common/gpuData"; import { EnvironmentInformation, NodeInfomation } from "./environment"; -import { GPUInfo } from "training_service/common/gpuData"; export class TrialDetail implements TrialJobDetail { 
public id: string; diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index e3bb18f957..18c79495ac 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -465,7 +465,7 @@ def validate_pai_config_path(self, experiment_config): if not taskRoles_dict: raise SchemaError('Please set taskRoles in paiConfigPath config file!') else: - pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'paiStoragePlugin', 'command'] + pai_trial_fields_required_list = ['image', 'gpuNum', 'cpuNum', 'memoryMB', 'command'] for trial_field in pai_trial_fields_required_list: if experiment_config['trial'].get(trial_field) is None: raise SchemaError('Please set {0} in trial configuration,\ From a65a81052eb1512cd9cfcba9a4240911b9c67697 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Wed, 24 Jun 2020 17:46:55 +0800 Subject: [PATCH 38/98] fix pylint errors --- tools/nni_trial_tool/base_channel.py | 1 - tools/nni_trial_tool/file_channel.py | 3 --- tools/nni_trial_tool/trial_runner.py | 2 +- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py index ab52040684..6bd73a8c92 100644 --- a/tools/nni_trial_tool/base_channel.py +++ b/tools/nni_trial_tool/base_channel.py @@ -5,7 +5,6 @@ import threading import time from abc import ABC, abstractmethod -from datetime import datetime from enum import Enum from queue import Empty, Queue diff --git a/tools/nni_trial_tool/file_channel.py b/tools/nni_trial_tool/file_channel.py index 85384c0e6d..9a431d25f7 100644 --- a/tools/nni_trial_tool/file_channel.py +++ b/tools/nni_trial_tool/file_channel.py @@ -2,11 +2,8 @@ # Licensed under the MIT license. 
import os -import time -from datetime import datetime from .base_channel import BaseChannel -from .log_utils import LogType, nni_log command_path = "./commands" runner_commands_file_name_prefix = "runner_commands" diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 0dfcc59b00..f822e6b443 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -96,7 +96,7 @@ def main_loop(args): trial.kill() del trials[trial.id] # wait to send commands - for i in range(10): + for _ in range(10): if command_channel.sent(): break time.sleep(1) From 6d36ae57a8f105b545e981b5029487c4a76a5950 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Fri, 26 Jun 2020 09:23:37 +0800 Subject: [PATCH 39/98] fix lint error --- tools/nni_trial_tool/trial_runner.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index f822e6b443..97d7c363e1 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -47,6 +47,7 @@ def main_loop(args): if command_type == CommandType.NewTrialJob: trial_id = command_data["trialId"] if trial_id in trials.keys(): + trial = trials[trial_id] if trial.is_running(): raise Exception('trial %s is running already, cannot start a new one' % trial.id) else: From 5e352f714fd39f929702f3834c6a95a8e95f20de Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 26 Jun 2020 09:40:54 +0800 Subject: [PATCH 40/98] init --- .../config/aml/checkEnvironment.py | 27 ++++ .../config/aml/createEnvironment.py | 9 +- ...checkEnvieonment.py => sotpEnvironment.py} | 4 +- src/nni_manager/config/aml/uploadFile.py | 30 ++++ .../common/containerJobData.ts | 12 ++ .../reusable/amlEnvironmentService.ts | 56 ++++++-- .../reusable/amlTrialService.ts | 131 ++++++++++++++++++ .../reusable/routerTrainingService.ts | 6 +- .../reusable/trialDispatcher.ts | 25 ++-- 9 files changed, 267 insertions(+), 33 deletions(-) create mode 
100644 src/nni_manager/config/aml/checkEnvironment.py rename src/nni_manager/config/aml/{checkEnvieonment.py => sotpEnvironment.py} (93%) create mode 100644 src/nni_manager/config/aml/uploadFile.py create mode 100644 src/nni_manager/training_service/reusable/amlTrialService.ts diff --git a/src/nni_manager/config/aml/checkEnvironment.py b/src/nni_manager/config/aml/checkEnvironment.py new file mode 100644 index 0000000000..a6bc9c519a --- /dev/null +++ b/src/nni_manager/config/aml/checkEnvironment.py @@ -0,0 +1,27 @@ +import os +import time +from argparse import ArgumentParser +from azureml.core import Experiment, RunConfiguration, ScriptRunConfig +from azureml.core.compute import ComputeTarget +from azureml.core.run import RUNNING_STATES, RunStatus, Run +from azureml.core import Workspace +from azureml.core.conda_dependencies import CondaDependencies + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument('--subscription_id', help='the subscription id of aml') + parser.add_argument('--resource_group', help='the resource group of aml') + parser.add_argument('--workspace_name', help='the workspace name of aml') + parser.add_argument('--experiment_name', help='the experiment name') + parser.add_argument('--environment_id', help='the experiment id') + args = parser.parse_args() + + ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) + experiment = Experiment(ws, args.experiment_name) + + run_list = experiment.get_runs() + for run in run_list: + if run.get_details()['runId'] == args.environment_id: + print(run.get_details()['status']) + exit(0) + print('Unknown') diff --git a/src/nni_manager/config/aml/createEnvironment.py b/src/nni_manager/config/aml/createEnvironment.py index e3f50c1bad..479313733d 100644 --- a/src/nni_manager/config/aml/createEnvironment.py +++ b/src/nni_manager/config/aml/createEnvironment.py @@ -22,12 +22,13 @@ ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) 
compute_target = ComputeTarget(workspace=ws, name=args.computer_target) experiment = Experiment(ws, args.experiment_name) - dependencies = CondaDependencies() - dependencies.add_pip_package("azureml-sdk") - dependencies.add_pip_package("azureml") + #dependencies = CondaDependencies() + #dependencies.add_pip_package("azureml-sdk") + #dependencies.add_pip_package("azureml") run_config = RunConfiguration() - run_config.environment.python.conda_dependencies = dependencies + #run_config.environment.python.conda_dependencies = dependencies + run_config.environment.python.interpreter_path = "/root/miniconda3/bin/python" run_config.environment.docker.enabled = True run_config.environment.docker.base_image = args.docker_image run_config.target = compute_target diff --git a/src/nni_manager/config/aml/checkEnvieonment.py b/src/nni_manager/config/aml/sotpEnvironment.py similarity index 93% rename from src/nni_manager/config/aml/checkEnvieonment.py rename to src/nni_manager/config/aml/sotpEnvironment.py index 84d726a91b..f2eca90082 100644 --- a/src/nni_manager/config/aml/checkEnvieonment.py +++ b/src/nni_manager/config/aml/sotpEnvironment.py @@ -22,6 +22,4 @@ run_list = experiment.get_runs() for run in run_list: if run['runId'] == args.environment_id: - print(run['status']) - return - print('Unknown') + run.cancel() diff --git a/src/nni_manager/config/aml/uploadFile.py b/src/nni_manager/config/aml/uploadFile.py new file mode 100644 index 0000000000..0e9182fde9 --- /dev/null +++ b/src/nni_manager/config/aml/uploadFile.py @@ -0,0 +1,30 @@ +import os +import time +from argparse import ArgumentParser +from azureml.core import Experiment, RunConfiguration, ScriptRunConfig +from azureml.core.compute import ComputeTarget +from azureml.core.run import RUNNING_STATES, RunStatus, Run +from azureml.core import Workspace +from azureml.core.conda_dependencies import CondaDependencies + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument('--subscription_id', 
help='the subscription id of aml') + parser.add_argument('--resource_group', help='the resource group of aml') + parser.add_argument('--workspace_name', help='the workspace name of aml') + parser.add_argument('--experiment_name', help='the experiment name') + parser.add_argument('--environment_id', help='the experiment id') + parser.add_argument('--remote_file_name', help='the remote file name') + parser.add_argument('--local_file_path', help='the local file path') + args = parser.parse_args() + + ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) + experiment = Experiment(ws, args.experiment_name) + + run_list = experiment.get_runs() + for run in run_list: + if run.get_details()['runId'] == args.environment_id: + run.upload_file(args.remote_file_name, args.local_file_path) + print('succeed') + exit(0) + print('failed') diff --git a/src/nni_manager/training_service/common/containerJobData.ts b/src/nni_manager/training_service/common/containerJobData.ts index f7a29f384a..9dd6996e22 100644 --- a/src/nni_manager/training_service/common/containerJobData.ts +++ b/src/nni_manager/training_service/common/containerJobData.ts @@ -12,3 +12,15 @@ else # Install nni python3 -m pip install --user --upgrade nni fi`; + +export const AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT: string = +`#!/bin/bash +if python3 -c 'import nni' > /dev/null 2>&1; then + # nni module is already installed, skip + return +else + # Install nni + python3 -m pip install --user --no-cache-dir -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nni==1.6777 +fi`; + + diff --git a/src/nni_manager/training_service/reusable/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/amlEnvironmentService.ts index d945ef4856..fa2b6d8e6a 100644 --- a/src/nni_manager/training_service/reusable/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/amlEnvironmentService.ts @@ -50,8 +50,8 @@ const yaml = require('js-yaml'); export class 
AMLEnvironmentService implements EnvironmentService { private readonly log: Logger = getLogger(); - private amlClusterConfig: AMLClusterConfig | undefined; - private amlTrialConfig: AMLTrialConfig | undefined; + public amlClusterConfig: AMLClusterConfig | undefined; + public amlTrialConfig: AMLTrialConfig | undefined; private amlJobConfig: any; private stopping: boolean = false; private versionCheck: boolean = true; @@ -104,31 +104,29 @@ export class AMLEnvironmentService implements EnvironmentService { } public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { + const deferred: Deferred = new Deferred(); environments.forEach((environment) => { - if (this.amlClusterConfig === undefined) { throw new Error('AML Cluster config is not initialized'); } if (this.amlTrialConfig === undefined) { throw new Error('AML trial config is not initialized'); } - - let pyshell = new PythonShell('createEnvironment.py', { + + let pyshell = new PythonShell('checkEnvironment.py', { scriptPath: './config/aml', pythonOptions: ['-u'], // get print results in real-time args: [ '--subscription_id', this.amlClusterConfig.subscriptionId, '--resource_group', this.amlClusterConfig.resourceGroup, '--workspace_name', this.amlClusterConfig.workspaceName, - '--computer_target', this.amlTrialConfig.computerTarget, '--experiment_name', `nni_exp_${this.experimentId}`, - '--code_dir', environment.environmentLocalTempFolder, - '--script', 'nni_script.py' + '--environment_id', environment.id ] }); pyshell.on('message', function (status: any) { // received a message sent from the Python script (a simple "print" statement) - console.log(`update status ${status}`); + console.log(`------------------get status from aml: ${status}--------------`); switch (status.toUpperCase()) { case 'QUEUED': environment.status = 'WAITING'; @@ -137,7 +135,7 @@ export class AMLEnvironmentService implements EnvironmentService { case 'RUNNING': case 'SUCCEEDED': case 'FAILED': - 
environment.status = status; + environment.status = status.toUpperCase(); break; case 'STOPPED': case 'STOPPING': @@ -146,14 +144,15 @@ export class AMLEnvironmentService implements EnvironmentService { default: environment.status = 'UNKNOWN'; } + console.log('-------------update environment status to ' + environment.status) }); }); - return; + deferred.resolve(); + return deferred.promise; } public async startEnvironment(environment: EnvironmentInformation): Promise { - const deferred: Deferred = new Deferred(); - + const deferred: Deferred = new Deferred(); if (this.amlClusterConfig === undefined) { throw new Error('AML Cluster config is not initialized'); } @@ -182,11 +181,40 @@ export class AMLEnvironmentService implements EnvironmentService { // received a message sent from the Python script (a simple "print" statement) console.log(envId); environment.id = envId; + deferred.resolve(); }); - return deferred.resolve(); + return deferred.promise; } public async stopEnvironment(environment: EnvironmentInformation): Promise { + const deferred: Deferred = new Deferred(); + + if (this.amlClusterConfig === undefined) { + throw new Error('AML Cluster config is not initialized'); + } + if (this.amlTrialConfig === undefined) { + throw new Error('AML trial config is not initialized'); + } + //TODO: use temp folder + //let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId); + await fs.promises.writeFile(path.join(environment.environmentLocalTempFolder, 'nni_script.py'), environment.command ,{ encoding: 'utf8' }); + let pyshell = new PythonShell('stopEnvironment.py', { + scriptPath: './config/aml', + pythonOptions: ['-u'], // get print results in real-time + args: [ + '--subscription_id', this.amlClusterConfig.subscriptionId, + '--resource_group', this.amlClusterConfig.resourceGroup, + '--workspace_name', this.amlClusterConfig.workspaceName, + '--experiment_name', `nni_exp_${this.experimentId}`, + '--environment_id', environment.id + ] + }); + 
pyshell.on('message', function (envId: any) { + // received a message sent from the Python script (a simple "print" statement) + console.log(envId); + environment.id = envId; + }); + return deferred.resolve(); } } diff --git a/src/nni_manager/training_service/reusable/amlTrialService.ts b/src/nni_manager/training_service/reusable/amlTrialService.ts new file mode 100644 index 0000000000..0efed002d4 --- /dev/null +++ b/src/nni_manager/training_service/reusable/amlTrialService.ts @@ -0,0 +1,131 @@ +/** + * Copyright (c) Microsoft Corporation + * All rights reserved. + * + * MIT License + * + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated + * documentation files (the "Software"), to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and + * to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING + * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, + * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +'use strict'; + +import * as fs from 'fs'; +import * as request from 'request'; +import * as path from 'path'; +import { Deferred } from 'ts-deferred'; +import * as component from "../../common/component"; +import { delay, generateParamFileName, getExperimentRootDir } from "../../common/utils"; +import { KILL_TRIAL_JOB, NEW_TRIAL_JOB } from '../../core/commands'; +import { getExperimentId } from '../../common/experimentStartupInfo'; +import { encodeCommand } from "../../core/ipcInterface"; +import { EnvironmentInformation } from "./environment"; +import { TrialDetail, TrialService } from "./trial"; +import { PythonShell } from 'python-shell'; +import { TrialJobApplicationForm } from "../../common/trainingService"; +import { AMLClusterConfig, AMLTrialConfig, AMLTrialJobDetail } from '../aml/amlConfig'; +import { AMLTrainingService } from 'training_service/aml/amlTrainingService'; +import { AMLEnvironmentService } from './amlEnvironmentService'; + +@component.Singleton +export class AMLTrialService extends TrialService { + + private amlClusterConfig: AMLClusterConfig | undefined; + private amlTrialConfig: AMLTrialConfig | undefined; + private experimentId: string; + private amlEnvironmentService: AMLEnvironmentService; + + constructor() { + super(); + this.amlEnvironmentService = component.get(AMLEnvironmentService); + this.amlClusterConfig = this.amlEnvironmentService.amlClusterConfig; + this.amlTrialConfig = this.amlEnvironmentService.amlTrialConfig; + this.experimentId = getExperimentId(); + } + + public async config(_key: string, _value: string): Promise { + return; + } + + public async refreshTrialsStatus(trials: TrialDetail[]): Promise { + for (const trial of trials) { + const currentStatus = trial.status; + // to prevent inconsistent status, skip all non running trials + if (currentStatus !== "RUNNING") { + continue; + } + + const environment = trial.environment; + if (environment === undefined) { + this.log.error(`found running trial ${trial.id} has 
no environment, set trial to UNKNOWN.`); + trial.status = "UNKNOWN"; + continue; + } + + console.log('--------update trial status-------') + } + } + + public async startTrial(trial: TrialDetail): Promise { + console.log('-----------79 start trial--------') + if (trial.environment === undefined) { + throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); + } + await this.sendCommand(NEW_TRIAL_JOB, trial.settings, trial.environment); + } + + public async stopTrial(trial: TrialDetail): Promise { + if (trial.environment === undefined) { + throw new Error(`trialService: environment of trial ${trial.id} shouldn't be undefined!`); + } + await this.sendCommand(KILL_TRIAL_JOB, trial.id, trial.environment); + } + + public async updateTrial(trial: TrialDetail, form: TrialJobApplicationForm): Promise { + } + + private async sendCommand(commantType: string, data: any, environment: EnvironmentInformation): Promise { + console.log('---------------96 send command0----------') + const deferred: Deferred = new Deferred(); + const command = encodeCommand(commantType, JSON.stringify(data)); + let fileName = `manager_command_${new Date().getTime()}.txt`; + let filePath = path.join(environment.environmentLocalTempFolder, fileName); + await fs.promises.writeFile(filePath, command.toString("utf8"), { encoding: 'utf8' }); + + if (this.amlClusterConfig === undefined) { + throw new Error('AML Cluster config is not initialized'); + } + if (this.amlTrialConfig === undefined) { + throw new Error('AML trial config is not initialized'); + } + + let pyshell = new PythonShell('uploadFile.py', { + scriptPath: './config/aml', + pythonOptions: ['-u'], // get print results in real-time + args: [ + '--subscription_id', this.amlClusterConfig.subscriptionId, + '--resource_group', this.amlClusterConfig.resourceGroup, + '--workspace_name', this.amlClusterConfig.workspaceName, + '--experiment_name', `nni_exp_${this.experimentId}`, + '--environment_id', environment.id, + 
'--remote_file_name', fileName, + '--local_file_path', filePath, + ] + }); + pyshell.on('message', function (result: any) { + // received a message sent from the Python script (a simple "print" statement) + console.log(`============upload data======${result}`); + deferred.resolve(); + }); + return deferred.promise; + } +} diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index cd62919a04..1ec15fd8a0 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -37,6 +37,7 @@ import { StorageService } from './storageService'; import { MountedStorageService } from './mountedStorageService'; import { TrialService } from './trial'; import { StorageTrialService } from './storageTrialService'; +import { AMLTrialService } from './amlTrialService'; /** @@ -153,11 +154,8 @@ class RouterTrainingService implements TrainingService { Container.bind(EnvironmentService) .to(AMLEnvironmentService) .scope(Scope.Singleton); - Container.bind(StorageService) - .to(MountedStorageService) - .scope(Scope.Singleton); Container.bind(TrialService) - .to(StorageTrialService) + .to(AMLTrialService) .scope(Scope.Singleton); for (const [key, value] of this.metaDataCache) { if (this.internalTrainingService === undefined) { diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 902d0f77c7..f1f1d6eb5f 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -27,7 +27,7 @@ import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from 
'../../common/trainingService'; import { delay, getLogLevel, getVersion, uniqueString, getExperimentRootDir } from '../../common/utils'; -import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT, AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { validateCodeDir, execCopydir, execMkdir } from '../common/util'; @@ -299,11 +299,11 @@ class TrialDispatcher implements TrainingService { toRefreshedTrials.push(trial); } } - + console.log('-------------------trial dispatcher----302-------------') + console.log(toRefreshedTrials.length) if (toRefreshedTrials.length == 0) { continue; } - const trialService = component.get(TrialService); trialService.refreshTrialsStatus(toRefreshedTrials); @@ -361,10 +361,14 @@ class TrialDispatcher implements TrainingService { break; } } - let liveEnvironmentsCount = 0; const idleEnvironments: EnvironmentInformation[] = []; this.environments.forEach((environment) => { + console.log('-----------env status-------') + console.log(environment.id) + console.log(environment.isAlive) + console.log(environment.status) + console.log(environment.isIdle) if (environment.isAlive === true) { liveEnvironmentsCount++; if (environment.status === "RUNNING" && environment.isIdle) { @@ -372,11 +376,14 @@ class TrialDispatcher implements TrainingService { } } }); - + console.log('------------before assign environment---------') + console.log(idleEnvironments.length) + console.log(waitingTrials.length) while (idleEnvironments.length > 0 && waitingTrials.length > 0) { const trial = waitingTrials.shift(); const idleEnvironment = idleEnvironments.shift(); if (trial !== undefined && idleEnvironment != undefined) { + console.log('--------start assign env----------') await this.assignEnvironment(trial, idleEnvironment); } } @@ -416,18 
+423,20 @@ class TrialDispatcher implements TrainingService { let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp", envId); await execMkdir(environmentLocalTempFolder); const runnerSettingsPath = path.join(environmentLocalTempFolder, "settings.json"); + this.runnerSettings.command = "python3 test.py"; await fs.promises.writeFile(runnerSettingsPath, JSON.stringify(this.runnerSettings), { encoding: 'utf8' }); const installFilePath = path.join(environmentLocalTempFolder, "install_nni.sh"); - await fs.promises.writeFile(installFilePath, CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); - environment.command = `import os\nos.system('sh install_nni.sh && python3 -m nni_trial_tool.trial_runner')`; + await fs.promises.writeFile(installFilePath, AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); + environment.command = `import os\nos.system('sh install_nni.sh && cd code && python3 -m nni_trial_tool.trial_runner')`; environment.environmentLocalTempFolder = environmentLocalTempFolder; let environmentLocalTempTrialFolder = path.join(environmentLocalTempFolder, 'code'); await execMkdir(environmentLocalTempTrialFolder); await execCopydir(this.trialConfig.codeDir, environmentLocalTempTrialFolder); } - this.environments.set(environment.id, environment); await environmentService.startEnvironment(environment); + console.log('--------------finish start experiment-----' + environment.id) + this.environments.set(environment.id, environment); if (environment.status === "FAILED") { environment.isIdle = false; From a3a91d8faf47d042309dfd13f9a8b64b3435fb7d Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 29 Jun 2020 01:05:49 +0800 Subject: [PATCH 41/98] format --- src/nni_manager/config/aml/amlUtil.py | 4 +- .../common/containerJobData.ts | 2 +- .../reusable/{ => aml}/amlData.ts | 23 +++++++- .../reusable/channels/amlCommandChannel.ts | 55 +++++++++++++++---- .../environments/amlEnvironmentService.ts | 14 +---- 
.../reusable/trialDispatcher.ts | 29 +++------- src/sdk/pynni/nni/platform/__init__.py | 2 +- tools/nni_trial_tool/aml_channel.py | 13 ++++- tools/nni_trial_tool/log_utils.py | 7 +++ tools/nni_trial_tool/trial.py | 6 +- 10 files changed, 105 insertions(+), 50 deletions(-) rename src/nni_manager/training_service/reusable/{ => aml}/amlData.ts (84%) diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index 42add082ce..b4910e9ea0 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -42,11 +42,13 @@ line = sys.stdin.readline().rstrip() if line == 'update_status': print('status:' + run.get_status()) + elif line == 'tracking_url': + print('tracking_url:' + run.get_portal_url()) elif line == 'stop': run.cancel() exit(0) elif line == 'receive': - print(run.get_metrics()) + print('receive:' + json.dumps(run.get_metrics())) elif line: items = line.split(':') if items[0] == 'command': diff --git a/src/nni_manager/training_service/common/containerJobData.ts b/src/nni_manager/training_service/common/containerJobData.ts index e8a2ea6f6f..17a482ae33 100644 --- a/src/nni_manager/training_service/common/containerJobData.ts +++ b/src/nni_manager/training_service/common/containerJobData.ts @@ -20,7 +20,7 @@ if python3 -c 'import nni' > /dev/null 2>&1; then return else # Install nni - python3 -m pip install --user --no-cache-dir -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nni==1.680 + python3 -m pip install --user --no-cache-dir -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nni==1.63473 fi`; diff --git a/src/nni_manager/training_service/reusable/amlData.ts b/src/nni_manager/training_service/reusable/aml/amlData.ts similarity index 84% rename from src/nni_manager/training_service/reusable/amlData.ts rename to src/nni_manager/training_service/reusable/aml/amlData.ts index a18b2060ec..6a83a22697 100644 --- 
a/src/nni_manager/training_service/reusable/amlData.ts +++ b/src/nni_manager/training_service/reusable/aml/amlData.ts @@ -35,6 +35,7 @@ export class AMLClient { public pythonShellClient: undefined | PythonShell; public codeDir: string; public computerTarget: string; + private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?.*?)'`; constructor( subscriptionId: string, @@ -86,6 +87,23 @@ export class AMLClient { this.pythonShellClient.send('stop'); } + public getTrackingUrl(): Promise { + const deferred: Deferred = new Deferred(); + if (this.pythonShellClient === undefined) { + throw Error('python shell client not initialized!'); + } + this.pythonShellClient.send('tracking_url'); + let trackingUrl = ''; + this.pythonShellClient.on('message', function (status: any) { + let items = status.split(':'); + if (items[0] === 'tracking_url') { + trackingUrl = items.splice(1, items.length).join('') + } + deferred.resolve(trackingUrl); + }); + return deferred.promise; + } + public updateStatus(oldStatus: string): Promise { const deferred: Deferred = new Deferred(); if (this.pythonShellClient === undefined) { @@ -117,7 +135,10 @@ export class AMLClient { } this.pythonShellClient.send('receive'); this.pythonShellClient.on('message', function (command: any) { - deferred.resolve(command); + let items = command.split(':') + if (items[0] === 'receive') { + deferred.resolve(JSON.parse(command.slice(8))) + } }); return deferred.promise; } diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index c3873bb4b6..c5e19ddcea 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -23,19 +23,25 @@ import * as component from "../../../common/component"; import { delay } from "../../../common/utils"; import { CommandChannel, RunnerConnection } from "../commandChannel"; import { 
EnvironmentInformation, Channel } from "../environment"; -import { AMLClient } from "../amlData"; +import { EventEmitter } from 'events'; class AMLRunnerConnection extends RunnerConnection { } export class AMLCommandChannel extends CommandChannel { - private readonly commandPath = "commands"; private stopping: boolean = false; private currentMessageIndex: number = -1; + private currentMetricIndex: number = -1; // make sure no concurrent issue when sending commands. private sendQueues: [EnvironmentInformation, string][] = []; - + private readonly metricEmitter: EventEmitter; + private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?.*?)'`; + + public constructor(commandEmitter: EventEmitter, metricsEmitter: EventEmitter) { + super(commandEmitter); + this.metricEmitter = metricsEmitter; + } public get channelName(): Channel { return "aml"; } @@ -76,10 +82,9 @@ export class AMLCommandChannel extends CommandChannel { const environment = item[0]; const message = item[1]; const amlClient = environment.environmentClient; - amlClient.sendCommand(`b'${message}'`); + amlClient.sendCommand(message); // send command sendCount += 1; - console.log(`----------sending command ${sendCount} b'${message}'`) } } @@ -100,24 +105,35 @@ export class AMLCommandChannel extends CommandChannel { for (const runnerConnection of runnerConnections) { // to loop all commands const amlClient = runnerConnection.environment.environmentClient; - const command = await amlClient.receiveCommand(); + let command = await amlClient.receiveCommand(); if (command && command.hasOwnProperty('trial_runner')) { let messages = command['trial_runner']; if (messages) { if (messages instanceof Object && this.currentMessageIndex < messages.length - 1) { for (let index = this.currentMessageIndex + 1; index < messages.length; index ++) { - console.log(`---------------handle command ${messages[index]}`) - this.handleCommand(runnerConnection.environment, messages[index]); + 
this.handleCommand(runnerConnection.environment, messages[index].toString()); } this.currentMessageIndex = messages.length - 1; } else if (this.currentMessageIndex === -1){ - console.log(`---------------handle command ${messages}`) - this.handleCommand(runnerConnection.environment, messages); + this.handleCommand(runnerConnection.environment, messages.toString()); this.currentMessageIndex += 1; } } + } + if (command && command.hasOwnProperty('trial_runner_sdk')) { + let messages = command['trial_runner_sdk']; + if (messages) { + if (messages instanceof Object && this.currentMetricIndex < messages.length - 1) { + for (let index = this.currentMetricIndex + 1; index < messages.length; index ++) { + this.handleTrialMetrics(messages[index].toString()); + } + this.currentMetricIndex = messages.length - 1; + } else if (this.currentMetricIndex === -1){ + this.handleTrialMetrics(messages.toString()); + this.currentMetricIndex += 1; + } + } } - } const end = new Date(); @@ -127,4 +143,21 @@ export class AMLCommandChannel extends CommandChannel { } } } + + private handleTrialMetrics(message: string): void { + console.log('-------handle trial metric------' + message) + let messageObj = JSON.parse(message); + let trialId = messageObj['trialId']; + let msg = messageObj['msg']; + const metricsContent: any = msg.match(this.NNI_METRICS_PATTERN); + if (metricsContent && metricsContent.groups) { + const key: string = 'metrics'; + const metric = metricsContent.groups[key]; + console.log(`-----get ${metric} for trial ${trialId}------`); + this.metricEmitter.emit('metric', { + id: trialId, + data: metric + }); + } + } } diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index a7d6ffcdf0..337768c92a 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ 
b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -20,7 +20,6 @@ 'use strict'; import * as fs from 'fs'; -import * as request from 'request'; import * as path from 'path'; import { Deferred } from 'ts-deferred'; import * as component from '../../../common/component'; @@ -29,9 +28,7 @@ import { getLogger, Logger } from '../../../common/log'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; import { AMLClusterConfig, AMLTrialConfig, AMLTrialJobDetail } from '../../aml/amlConfig'; import { EnvironmentInformation, EnvironmentService } from '../environment'; -import { StorageService } from '../storageService'; -import { PythonShell } from 'python-shell'; -import { AMLClient } from '../amlData'; +import { AMLClient } from '../aml/amlData'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric @@ -41,16 +38,13 @@ import { delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, getVersion, uniqueString } from '../../../common/utils'; -import { AMLCommandChannel } from '../channels/amlCommandChannel'; - -const yaml = require('js-yaml'); /** * Collector PAI jobs info from PAI cluster, and update pai job status locally */ @component.Singleton export class AMLEnvironmentService implements EnvironmentService { - + private readonly log: Logger = getLogger(); public amlClusterConfig: AMLClusterConfig | undefined; public amlTrialConfig: AMLTrialConfig | undefined; @@ -107,11 +101,9 @@ export class AMLEnvironmentService implements EnvironmentService { public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { const deferred: Deferred = new Deferred(); - console.log('-----------line 110--------------------') environments.forEach(async (environment) => { let amlClient = environment.environmentClient; let status = await amlClient.updateStatus(environment.status); - console.log(`------------------get status from 
aml: ${status}--------------`); switch (status.toUpperCase()) { case 'QUEUED': environment.status = 'WAITING'; @@ -132,7 +124,6 @@ export class AMLEnvironmentService implements EnvironmentService { default: environment.status = 'UNKNOWN'; } - console.log('-------------update environment status to ' + environment.status) }); deferred.resolve(); return deferred.promise; @@ -157,6 +148,7 @@ export class AMLEnvironmentService implements EnvironmentService { environment.environmentLocalTempFolder ); environment.id = await amlClient.submit(); + environment.trackingUrl = await amlClient.getTrackingUrl(); environment.environmentClient = amlClient; } diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index e7c71b7765..9e3f37ca7d 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -32,7 +32,7 @@ import { GPUSummary } from '../../training_service/common/gpuData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT, AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { validateCodeDir, execMkdir, execCopydir } from '../common/util'; +import { validateCodeDir, execMkdir, execCopydir, tarAdd } from '../common/util'; import { WebCommandChannel } from './channels/webCommandChannel'; import { AMLCommandChannel } from './channels/amlCommandChannel'; import { Command, CommandChannel } from './commandChannel'; @@ -82,7 +82,7 @@ class TrialDispatcher implements TrainingService { this.commandEmitter = new EventEmitter(); if (this.runnerSettings.platform === 'aml') { - this.commandChannel = new AMLCommandChannel(this.commandEmitter); + this.commandChannel = new AMLCommandChannel(this.commandEmitter, this.metricsEmitter); } else { this.commandChannel = new 
WebCommandChannel(this.commandEmitter); } @@ -175,9 +175,11 @@ class TrialDispatcher implements TrainingService { public async run(): Promise { - await this.jobRestServer.start(); + if (this.runnerSettings.platform !== 'aml') { + await this.jobRestServer.start(); + this.log.info(`TrialDispatcher: rest server listening on: ${this.jobRestServer.endPoint}`); + } this.jobRestServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`TrialDispatcher: rest server listening on: ${this.jobRestServer.endPoint}`); this.runnerSettings.nniManagerPort = this.jobRestServer.clusterRestServerPort; this.runnerSettings.commandChannel = this.commandChannel.channelName; @@ -338,8 +340,6 @@ class TrialDispatcher implements TrainingService { toRefreshedTrials.push(trial); } } - console.log('-------------------trial dispatcher----302-------------') - console.log(toRefreshedTrials.length) if (toRefreshedTrials.length == 0) { continue; } @@ -411,11 +411,6 @@ class TrialDispatcher implements TrainingService { let liveEnvironmentsCount = 0; const idleEnvironments: EnvironmentInformation[] = []; this.environments.forEach((environment) => { - console.log('-----------env status-------') - console.log(environment.id) - console.log(environment.isAlive) - console.log(environment.status) - console.log(environment.isIdle) if (environment.isAlive === true) { liveEnvironmentsCount++; if (environment.status === "RUNNING" && environment.isIdle) { @@ -423,14 +418,10 @@ class TrialDispatcher implements TrainingService { } } }); - console.log('------------before assign environment---------') - console.log(idleEnvironments.length) - console.log(waitingTrials.length) while (idleEnvironments.length > 0 && waitingTrials.length > 0) { const trial = waitingTrials.shift(); const idleEnvironment = idleEnvironments.shift(); if (trial !== undefined && idleEnvironment != undefined) { - console.log('--------start assign env----------') await this.assignEnvironment(trial, idleEnvironment); } } @@ -470,15 
+461,13 @@ class TrialDispatcher implements TrainingService { let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp", envId); await execMkdir(environmentLocalTempFolder); const runnerSettingsPath = path.join(environmentLocalTempFolder, "settings.json"); - this.runnerSettings.command = `python3 ${this.trialConfig.command}`; + this.runnerSettings.command = this.trialConfig.command; await fs.promises.writeFile(runnerSettingsPath, JSON.stringify(this.runnerSettings), { encoding: 'utf8' }); const installFilePath = path.join(environmentLocalTempFolder, "install_nni.sh"); await fs.promises.writeFile(installFilePath, AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); - environment.command = `import os\nos.system('sh install_nni.sh && cd code && python3 -m nni_trial_tool.trial_runner')`; + environment.command = `import os\nos.system('sh install_nni.sh && mkdir ${this.experimentId} && cd ${this.experimentId} && python3 -m nni_trial_tool.trial_runner')`; environment.environmentLocalTempFolder = environmentLocalTempFolder; - let environmentLocalTempTrialFolder = path.join(environmentLocalTempFolder, 'code'); - await execMkdir(environmentLocalTempTrialFolder); - await execCopydir(this.trialConfig.codeDir, environmentLocalTempTrialFolder); + await tarAdd(path.join(environmentLocalTempFolder, 'nni-code.tar.gz'), this.trialConfig.codeDir); } await environmentService.startEnvironment(environment); diff --git a/src/sdk/pynni/nni/platform/__init__.py b/src/sdk/pynni/nni/platform/__init__.py index f4251bd5c7..84f04a9862 100644 --- a/src/sdk/pynni/nni/platform/__init__.py +++ b/src/sdk/pynni/nni/platform/__init__.py @@ -9,7 +9,7 @@ from .standalone import * elif trial_env_vars.NNI_PLATFORM == 'unittest': from .test import * -elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 'frameworkcontroller', 'paiYarn', 'dlts'): +elif trial_env_vars.NNI_PLATFORM in ('local', 'remote', 'pai', 'kubeflow', 
'frameworkcontroller', 'paiYarn', 'dlts', 'aml'): from .local import * else: raise RuntimeError('Unknown platform %s' % trial_env_vars.NNI_PLATFORM) diff --git a/tools/nni_trial_tool/aml_channel.py b/tools/nni_trial_tool/aml_channel.py index 70c68702a3..36642d9729 100644 --- a/tools/nni_trial_tool/aml_channel.py +++ b/tools/nni_trial_tool/aml_channel.py @@ -22,7 +22,13 @@ def _inner_close(self): pass def _inner_send(self, message): - self.run.log('trial_runner', str(message)) + try: + if type(message) is bytes: + self.run.log('trial_runner', message.decode('utf8')) + else: + self.run.log('trial_runner', message) + except Exception as exception: + nni_log(LogType.Error, 'meet unhandled exception when send message: %s' % exception) def _inner_receive(self): messages = [] @@ -38,4 +44,7 @@ def _inner_receive(self): elif self.current_message_index == -1: messages = [message_list] self.current_message_index += 1 - return messages + newMessage = [] + for message in messages: + newMessage.append(message.encode('utf8')) + return newMessage diff --git a/tools/nni_trial_tool/log_utils.py b/tools/nni_trial_tool/log_utils.py index 373fb923f1..08dc9d23a9 100644 --- a/tools/nni_trial_tool/log_utils.py +++ b/tools/nni_trial_tool/log_utils.py @@ -10,12 +10,14 @@ import threading import re +from azureml.core.run import Run from datetime import datetime from enum import Enum, unique from logging import StreamHandler from queue import Queue +from .constants import NNI_PLATFORM from .rest_utils import rest_post from .url_utils import gen_send_stdout_url @@ -52,16 +54,21 @@ def __init__(self, host, port, tag, trial_id, std_output_type=StdOutputType.Stdo self.trial_id = trial_id self.orig_stdout = sys.__stdout__ self.orig_stderr = sys.__stderr__ + if NNI_PLATFORM == 'aml': + self.run = Run.get_context() def emit(self, record): log_entry = {} log_entry['tag'] = self.tag + log_entry['trialId'] = self.trial_id log_entry['stdOutputType'] = self.std_output_type.name log_entry['msg'] = 
self.format(record) try: if self.host: rest_post(gen_send_stdout_url(self.host, self.port, self.trial_id), json.dumps(log_entry), 10, True) + elif NNI_PLATFORM == 'aml' and self.tag == 'trial': + self.run.log('trial_runner_sdk', json.dumps(log_entry)) except Exception as e: self.orig_stderr.write(str(e) + '\n') self.orig_stderr.flush() diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index dd15ca2ec2..18df40aa89 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -45,8 +45,10 @@ def run(self): self.args.log_collection, self.id) nni_log(LogType.Info, "%s: start to run trial" % self.name) - - trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) + if self.args.platform == 'aml': + trial_working_dir = os.path.realpath(os.path.join(os.curdir, "trials", self.id)) + else: + trial_working_dir = os.path.realpath(os.path.join(os.curdir, "..", "..", "trials", self.id)) self.trial_output_dir = os.path.join(trial_working_dir, trial_output_path_name) trial_code_dir = os.path.join(trial_working_dir, "code") trial_nnioutput_dir = os.path.join(trial_working_dir, "nnioutput") From 69a51705c51dd7efaa6c1e3fd2d1bfc388465aef Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 08:51:52 +0800 Subject: [PATCH 42/98] remove useless deferred. 
--- .../remote_machine/remoteMachineTrainingService.ts | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 533bb79e21..44451a095e 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -406,10 +406,8 @@ class RemoteMachineTrainingService implements TrainingService { private async setupConnections(machineList: string): Promise { this.log.debug(`Connecting to remote machines: ${machineList}`); - const deferred: Deferred = new Deferred(); //TO DO: verify if value's format is wrong, and json parse failed, how to handle error const rmMetaList: RemoteMachineMeta[] = JSON.parse(machineList); - let connectedRMNum: number = 0; const connectionPromises = []; for (const rmMeta of rmMetaList) { @@ -422,13 +420,9 @@ class RemoteMachineTrainingService implements TrainingService { this.log.debug(`initializing ${executor.name}`); connectionPromises.push(this.initRemoteMachineOnConnected(rmMeta, executor)); this.log.info(`connected to ${executor.name}`); - if (++connectedRMNum === rmMetaList.length) { - deferred.resolve(); - } } - Promise.all(connectionPromises); - return deferred.promise; + Promise.all(connectionPromises); } private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, executor: ShellExecutor): Promise { From edc4608eb55021fe9dadb19b26442b1f732a7a39 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 29 Jun 2020 09:41:45 +0800 Subject: [PATCH 43/98] fix package --- deployment/pypi/setup.py | 2 ++ setup.py | 2 ++ src/nni_manager/config/aml/amlUtil.py | 5 ----- tools/setup.py | 2 ++ 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/deployment/pypi/setup.py b/deployment/pypi/setup.py index 61f7ff0178..8985e90ead 100644 --- 
a/deployment/pypi/setup.py +++ b/deployment/pypi/setup.py @@ -50,6 +50,8 @@ package_data = {'nni': ['**/requirements.txt']}, python_requires = '>=3.5', install_requires = [ + 'azureml', + 'azureml-sdk', 'schema', 'ruamel.yaml', 'psutil', diff --git a/setup.py b/setup.py index 30d4f448c6..c1da2ddf08 100644 --- a/setup.py +++ b/setup.py @@ -30,6 +30,8 @@ def read(fname): python_requires = '>=3.5', install_requires = [ 'astor', + 'azureml', + 'azureml-sdk', 'hyperopt==0.1.2', 'json_tricks', 'netifaces', diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index b4910e9ea0..7b0a4a8063 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -24,13 +24,8 @@ ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) compute_target = ComputeTarget(workspace=ws, name=args.computer_target) experiment = Experiment(ws, args.experiment_name) - dependencies = CondaDependencies() - dependencies.add_pip_package("azureml-sdk") - dependencies.add_pip_package("azureml") run_config = RunConfiguration() - run_config.environment.python.conda_dependencies = dependencies - run_config.environment.python.interpreter_path = "/root/miniconda3/bin/python" run_config.environment.docker.enabled = True run_config.environment.docker.base_image = args.docker_image run_config.target = compute_target diff --git a/tools/setup.py b/tools/setup.py index 48d6923dca..7e1330f2a7 100644 --- a/tools/setup.py +++ b/tools/setup.py @@ -10,6 +10,8 @@ python_requires = '>=3.5', install_requires = [ + 'azureml', + 'azureml-sdk', 'requests', 'ruamel.yaml', 'psutil', From c1f0239655c06ce9931cb77336121ceabd5438cd Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 12:10:19 +0800 Subject: [PATCH 44/98] fix incorrect check logic --- .../reusable/environments/openPaiEnvironmentService.ts | 2 -- src/nni_manager/training_service/reusable/storageService.ts | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) 
diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 9a463cbb53..3791f7dd9e 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -90,8 +90,6 @@ export class OpenPaiEnvironmentService implements EnvironmentService { } break; } - case TrialConfigMetadataKey.MULTI_PHASE: - break; default: this.log.debug(`OpenPAI not proccessed metadata key: '${key}', value: '${value}'`); } diff --git a/src/nni_manager/training_service/reusable/storageService.ts b/src/nni_manager/training_service/reusable/storageService.ts index 2287e37b9f..b5a019acb9 100644 --- a/src/nni_manager/training_service/reusable/storageService.ts +++ b/src/nni_manager/training_service/reusable/storageService.ts @@ -71,7 +71,7 @@ export abstract class StorageService { localPath = this.expandPath(false, localPath); remotePath = this.expandPath(true, remotePath); this.logger.debug(`copy localPath: ${localPath} to remotePath: ${remotePath}, asGzip ${asGzip}`); - if (!await this.exists(remotePath)) { + if (!await this.internalExists(remotePath)) { await this.internalMkdir(remotePath); } From af97bb157360edf2a12894bbb0a9ee521fb2e214 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 12:33:13 +0800 Subject: [PATCH 45/98] make license header consistent --- .../pai/paiK8S/paiK8SConfig.ts | 20 ++----------------- .../training_service/pai/paiK8S/paiK8SData.ts | 20 ++----------------- .../pai/paiK8S/paiK8STrainingService.ts | 20 ++----------------- .../reusable/channels/fileCommandChannel.ts | 20 ++----------------- .../reusable/channels/webCommandChannel.ts | 20 ++----------------- .../reusable/commandChannel.ts | 20 ++----------------- .../training_service/reusable/environment.ts | 20 ++----------------- 
.../environments/openPaiEnvironmentService.ts | 20 ++----------------- .../reusable/jobRestServer.ts | 20 ++----------------- .../reusable/routerTrainingService.ts | 20 ++----------------- .../reusable/storageService.ts | 20 ++----------------- .../storages/mountedStorageService.ts | 20 ++----------------- .../training_service/reusable/trial.ts | 20 ++----------------- .../reusable/trialDispatcher.ts | 20 ++----------------- 14 files changed, 28 insertions(+), 252 deletions(-) diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts index 3cb7aac64f..216832f71a 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SConfig.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; import {TrialConfig} from '../../common/trialConfig'; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts index 7851ad7c84..2c6b9f3d66 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8SData.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
'use strict'; diff --git a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts index 8d8560e9d2..e243387d39 100644 --- a/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts +++ b/src/nni_manager/training_service/pai/paiK8S/paiK8STrainingService.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
'use strict'; diff --git a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts index 002a5d5492..3c5149603a 100644 --- a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
'use strict'; diff --git a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts index bf6b2166a1..d380d75a64 100644 --- a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 
'use strict'; diff --git a/src/nni_manager/training_service/reusable/commandChannel.ts b/src/nni_manager/training_service/reusable/commandChannel.ts index ed9d6d0291..a20b6a1b38 100644 --- a/src/nni_manager/training_service/reusable/commandChannel.ts +++ b/src/nni_manager/training_service/reusable/commandChannel.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. - * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index c8f395eaf7..252c805d8a 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index 3791f7dd9e..df6a8a825c 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/jobRestServer.ts b/src/nni_manager/training_service/reusable/jobRestServer.ts index a0871d9ade..cb5c96fa44 100644 --- a/src/nni_manager/training_service/reusable/jobRestServer.ts +++ b/src/nni_manager/training_service/reusable/jobRestServer.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 35d92fecef..06280a8124 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/storageService.ts b/src/nni_manager/training_service/reusable/storageService.ts index b5a019acb9..75b64c2606 100644 --- a/src/nni_manager/training_service/reusable/storageService.ts +++ b/src/nni_manager/training_service/reusable/storageService.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts b/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts index 6b0597e0a7..a3e592e74c 100644 --- a/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts +++ b/src/nni_manager/training_service/reusable/storages/mountedStorageService.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. import * as fs from 'fs'; import * as path from 'path'; diff --git a/src/nni_manager/training_service/reusable/trial.ts b/src/nni_manager/training_service/reusable/trial.ts index bc30160af4..7f1c6323a5 100644 --- a/src/nni_manager/training_service/reusable/trial.ts +++ b/src/nni_manager/training_service/reusable/trial.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 2b25742c46..2bf4bc4329 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. 'use strict'; From c00cd318f620cdc1bf654ccf65b294535593a0ef Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 12:37:22 +0800 Subject: [PATCH 46/98] add missed await. 
--- .../remote_machine/remoteMachineTrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts index 44451a095e..15cea1ab4a 100644 --- a/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts +++ b/src/nni_manager/training_service/remote_machine/remoteMachineTrainingService.ts @@ -422,7 +422,7 @@ class RemoteMachineTrainingService implements TrainingService { this.log.info(`connected to ${executor.name}`); } - Promise.all(connectionPromises); + await Promise.all(connectionPromises); } private async initRemoteMachineOnConnected(rmMeta: RemoteMachineMeta, executor: ShellExecutor): Promise { From 78f13862f43b0f56e08199ab48a10eae1270edb9 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Mon, 29 Jun 2020 13:04:38 +0800 Subject: [PATCH 47/98] add doc and example --- docs/en_US/TrainingService/AMLMode.md | 60 ++++++++++++++++++ .../TrainingService/SupportTrainingService.md | 3 +- docs/img/aml_account.png | Bin 0 -> 25247 bytes examples/trials/mnist-tfv1/config_aml.yml | 25 ++++++++ src/nni_manager/config/aml/amlUtil.py | 5 +- .../common/containerJobData.ts | 12 ---- .../reusable/aml/{amlData.ts => amlClient.ts} | 6 +- .../{ => reusable}/aml/amlConfig.ts | 4 +- .../environments/amlEnvironmentService.ts | 5 +- .../reusable/routerTrainingService.ts | 2 +- .../reusable/trialDispatcher.ts | 4 +- 11 files changed, 103 insertions(+), 23 deletions(-) create mode 100644 docs/en_US/TrainingService/AMLMode.md create mode 100644 docs/img/aml_account.png create mode 100644 examples/trials/mnist-tfv1/config_aml.yml rename src/nni_manager/training_service/reusable/aml/{amlData.ts => amlClient.ts} (96%) rename src/nni_manager/training_service/{ => reusable}/aml/amlConfig.ts (95%) diff --git a/docs/en_US/TrainingService/AMLMode.md b/docs/en_US/TrainingService/AMLMode.md new file 
mode 100644 index 0000000000..f9ae38bcf5 --- /dev/null +++ b/docs/en_US/TrainingService/AMLMode.md @@ -0,0 +1,60 @@ +**Run an Experiment on Azure Machine Learning** +=== +NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/services/machine-learning/), called aml mode. Before starting to use NNI aml mode, you should have an account to access an [AML](https://ml.azure.com/) workspace. See Step 2 below if you don't have an AML account yet. In aml mode, your trial program will run in a Docker container on the AML compute target you configure. + +## Setup environment +Step 1. Install NNI, follow the install guide [here](../Tutorial/QuickStart.md). + +Step 2. Create an AML account, following the document [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli). + +Step 3. Get your account information. +![](../../img/aml_account.png) + +## Run an experiment +Use `examples/trials/mnist-tfv1` as an example. The NNI config YAML file's content is like: + +```yaml +authorName: default +experimentName: example_mnist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +trainingServicePlatform: aml +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . + computerTarget: ussc40rscl + nodeCount: 1 +amlConfig: + subscriptionId: ${replace_to_your_subscriptionId} + resourceGroup: ${replace_to_your_resourceGroup} + workspaceName: ${replace_to_your_workspaceName} + +``` + +Note: You should set `trainingServicePlatform: aml` in NNI config YAML file if you want to start experiment in aml mode. 
+ +Compared with [LocalMode](LocalMode.md), trial configuration in aml mode has these additional keys: +* computerTarget + * required key. The compute cluster name you want to use in your AML workspace. +* nodeCount + * required key. The node count of each run in your experiment. + +amlConfig: +* subscriptionId + * the subscriptionId of your account +* resourceGroup + * the resourceGroup of your account +* workspaceName + * the workspaceName of your account + \ No newline at end of file diff --git a/docs/en_US/TrainingService/SupportTrainingService.md b/docs/en_US/TrainingService/SupportTrainingService.md index ca2b9283fc..0076b4f52a 100644 --- a/docs/en_US/TrainingService/SupportTrainingService.md +++ b/docs/en_US/TrainingService/SupportTrainingService.md @@ -1,6 +1,6 @@ # TrainingService -NNI TrainingService provides the training platform for running NNI trial jobs. NNI supports [local](./LocalMode.md), [remote](./RemoteMachineMode.md), [pai](./PaiMode.md), [kubeflow](./KubeflowMode.md) and [frameworkcontroller](./FrameworkControllerMode.md) built-in training services. +NNI TrainingService provides the training platform for running NNI trial jobs. NNI supports [local](./LocalMode.md), [remote](./RemoteMachineMode.md), [pai](./PaiMode.md), [aml](./AMLMode.md), [kubeflow](./KubeflowMode.md) and [frameworkcontroller](./FrameworkControllerMode.md) built-in training services. NNI not only provides few built-in training service options, but also provides a method for customers to build their own training service easily. ## Built-in TrainingService @@ -12,6 +12,7 @@ NNI not only provides few built-in training service options, but also provides a |[__Pai__](./PaiMode.md)|NNI supports running an experiment on [OpenPAI](https://github.com/Microsoft/pai) (aka pai), called pai mode. Before starting to use NNI pai mode, you should have an account to access an [OpenPAI](https://github.com/Microsoft/pai) cluster. 
See [here](https://github.com/Microsoft/pai#how-to-deploy) if you don't have any OpenPAI account and want to deploy an OpenPAI cluster. In pai mode, your trial program will run in pai's container created by Docker.| |[__Kubeflow__](./KubeflowMode.md)|NNI supports running experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. Before starting to use NNI kubeflow mode, you should have a Kubernetes cluster, either on-premises or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), a Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is setup to connect to your Kubernetes cluster. If you are not familiar with Kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a good start. In kubeflow mode, your trial program will run as Kubeflow job in Kubernetes cluster.| |[__FrameworkController__](./FrameworkControllerMode.md)|NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you don't need to install Kubeflow for specific deep learning framework like tf-operator or pytorch-operator. Now you can use FrameworkController as the training service to run NNI experiment.| +|[__AML__](./AMLMode.md)|NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/services/machine-learning/) called aml mode. Before starting to use NNI aml mode, you should have an account to access an [AML](https://ml.azure.com/) cluster. 
| ## TrainingService Implementation diff --git a/docs/img/aml_account.png b/docs/img/aml_account.png new file mode 100644 index 0000000000000000000000000000000000000000..c036b4c047c09d8e41eeaa0646d36a8f74ba1cbe GIT binary patch literal 25247 zcmeIa2{_d2A2&SJDJ`c|QYUK&g>up%OE^S}L}h1;GUPC%NrSPUmQ#eWCi`iTEMpps zb@+7ODKaWGpHF$_V@k`bu9J2p7(m+_j#Y^dj8iHHO>5fzx)2)-{tf9E)#s( z=-9^fLhE5L*v1pb_0GazD+%DoYV8{EKQ}Yq#)H3Bc$__^3(KL3d;~9ka?&}a1A`H9 z8>Y^$2Cvs$I&S3wgKc^N{j8w4KX!n@?0-9p%YP2!a{=y#A@pT>A~)+$3gC zCw#N+etE_EXzlW1i-0kl-P`Fw@t1Eujq@uWuEu#*Ob(@ra|pB3{NjCqx!6Iu zSw@}!_ZovcRBl||IX*$J2%yyr9#q@a>RZ)W(^(qzVKUX<__v|hQO)5U6<=Ns##K!9 z4q8-9G!G_rP4#WUN?(UtZe6FjJ2)vkZEuc_b$f0?*~hU1;xe-@D||1^PVqDE3`K4* zmI-CXu(j+0a_NoAGQI@o%JR6hAE_o+Es;N+e#O1EsNyA(R!uYD_vK_JOhR`yjn#8SEJ+YfFKRcm!rEB+P z>$GU|`k&AvrFgk)etZ4qL5G$@V}q+CCZE)BlCN$Bg=7Q46l z%S_7pHr!2Nm&u*nX2(tW%1fgxoWOk@LRJ#5D@nfo_)N54@phd&_K3V(>R z3y2TNI?j?!^~;_7@BlaAy9oxPTqUo3svmjcLwU_lq}sbh26}Ka#Dk)Msn3+;qJS^& zZ%&u-!Ge$(y(5LwX}SaVMo;@b);vf{4w{i!`!Qw2JwS9f=`o-4v{?k*ZVh_6+W^jo zogU6RPcf?;nQ0gr8xw=U!fzy7z>0p!4iXcR&5+tz+5Av|uGO|bVxT0Xs5S>Bn=>{v zr1j<9wV5q2*i+&z0_@TO2c;5Yr4#j0!C87~$^sU_f)=e9GqnHg^w{Jvp#O8Lj;Q&d zRkK@JM!;NHkZy=zDE?8C!)gee;Tpk*I!fuq^i|M;=szS z9T<{-4ze_7sC@F`>~R?Eon?dtOckXh(IFfExJu886nThZi0G|9Sc_s~20U#8NAo(` z-77k@g`m6k2*RidhR4%o1e)*e$x@NaIfLj8%Zs8Hu_l^QXReIVY}`k1zyXA(-J!?( z(jHHqywP@K;9hyj=`EUnGn-HJuh7gfhxhv0n!!tL>561>0OzwAfwhYu4zFQp43YCn zea8j|sfk`WVw-^mY}5d9EKrr`IA~;kTlKs$x_e@mIoz%8=`KPI9eJWOs5&eEXYzPo z; zNzrc}Qq1=AouBSKf`;U@rDxMU+t+IHTYej#Bl70^LuY3Bb&RUW{kOI!r<4>T+R*~yIzHtD{y3s6Skfqlt@_^iC z_hu_rbSBeWk-$M$w?;9%o)PehOs(pasLmqnY7ZOFLtz&5g}lhzwTf|i+%s_iha^!nVk=nE7xa* z30ee)fWzds?=|zZO(y^4F62ExqE8&-o7Z-$h!bFM&s_x4%)p1JEr zv$hi-#lY$xihU}}F@jH~&~>cAulGAh<)2D(QUkY_JHHEvpnG+S&M0J8vy20#qG$e0 zQN(GETJ9)G)Cj~cV26xM~9zzT$qV%z`gc@}|Y}kh{Y5^^&tZ8+@W`^ddnk9o< zR7JV+(IKkjVbOZ9b+|_SU855+*$I2VAAHmSbnN4z3!KcZ&vG2F5T+cVR;~7_T4%Zm zx|UK>#YBhUoHtrp*;>PI0N?T9ya(j*-lS}7PQEqx4e^2;4ong3?cC*>&?z(o6v$S 
zU}LD4K>Fz11c%KQ0)pnk^7!PU1W~#U;zdr@wBx==i}AcCMu^E|i)ofMGoLHxfU-um zlK*BjlcVsrkt&gUQ9o%o(fcirqO_1*BehzE$^|Nkc}O2uic)up>p%-x^fs5coqboV z{eU#Jbq}$iWyFNmiqt4jyqV$^42=1SmC^w>>1eaWJ@|wpm1ezWzydd8{YRQDCXf00 zJo5W=0aaDiUG!@~=h-l;TxA1gx)9Ez)!NZDr#r%9TUj8U^@~k({YOjXi#u(y^7~fX z3Q%silC4w2)-@MJ)SIzv7@qru6Kf@ya7lA(!jW^4AP|@2=mfh`8lD;S?aJjnO3<2H+fw2>7{j8_*0=g$jFh2I z><>P0!uISr@kk3SL;89|nq**@ODdU5)p!tT+iwksGVU zMI)*U*WS8opq<6eF6Y$Bmx5r7^K2i9# z>0s}XLpBF7hOa*pTQK`FF?ep?IME|>#N@RHyf!w#O_G3SC~^iAkZ^RM8wg(&5?#=GA>! zUv}ipM&XLo|W9}w2`>B*Sw1pz+Bx*xn zdeVbXJPQ4)SnzW>r7fy{pj6AAZfK<#im&T4Dh_bPM}Si?W$fFe#y6UNhrZLdO&ypb zaK5!He_uFBOpo*dD|#IREKvjN=0{*DmU~0aH(>OW%^E$p_V}0ghc^Dv0DQP7XMz?> z?ib(V-P_BJqKC4{{UuipC+4wB_dihHvC~HJgCENFHj;|JHRjCLy5P|~;d0oX{t?Y0 z6O~)xq<}88L?Q~0Gi6b7oI;OK|D4VMZtk6)>VXa^Bg^QYIWG2TBmoPluhu*HGpB8+ zWE01w-1&DeL^63)gK&+U@{EdNy-)=rO;1R?ENVjA)BLuaac3{eTEr&+(>@u>Y5^}T zH=sLWxWLjdGcDc^g?8i?yG+C9B)|v!#mkyX5TegF>p3^Gr3mpw2BvUnNq{7}0%xa> z^)$~ox3D7U3P|85m>6&cCc5Mj?Z_wpKox#$UqLAEJ-L~xm3u=M_Y^+TxCUB z{Bi9rT>S9ZuY6EYWPA;ZwX7pPh~#V;O~QO6k@X{FMbi~^>z=0TAhcDwK2LB{hdKj? 
zraQ`f$HZz`$@17{?TwR-^>Gu&HZftGEB?C(&^=eelGe&T5@0weR|aR*>>~KD!44d( z&Jv{a5leH-D!@I*fZO#WWDk-zaVEBMeD?1$rf+D_ezk(9el^bW-xePpX|aXAhcGtO zB_3cUUz;_Bf&A*KHPtL^kgBcr)X(QO0T^eQWN1W*f`EWUjaIjHP1Y?`47kn!(W@9F zL15loFVJ{p`jbbH+T&hKpV*=PnX?cSHb)G#bFKXeSycf9R2&~E8h~Kzv=X~yw2P&F zi^W7Bw5NM`koZG?;+i$sGG^}xKxpdN*ovIy8hK3+v-8f>0bmvU1jyUmG*1;hz~~v# zNg`|Ail!p~mR9a=t3fH|v?=X610=P9)iYugVc`jk`ob!nI{buDh<^f($)&2N8A30D zbR?t(mH=&@3z*y2X-d?IEP#g`3;_?wEHdXg-RZ^viA(mC$L#tvLp z9wzsX0R?UggaGC=@1J9kq_GinpfnfNzcrSUGDR=OLWO+cgQ-o+PBu0W4`i$>JX|Ndok8CfJPP zRdT@go)KizkQ`^?d<5NB2hsJ3GtnKkgOCy=prpUq2K5L|kx1(IS#d7$4TRth8l9E} zXta74K}HTl{ZrN&z&;WN$ZhUIPdIXc+f~}J)1PsHe$oU*Lyc598O;f_qEmk*-)7nG`_R(jom5ClrV~SpE~b)p zn$UpsA*XllLek$xq0#2iI`C7bmTv0UfhOoAm0Pi9*jQVO>2Ql+zowuL>_E>**e=3` z0rL9z(^mgvK9mm+!wUA9i+%=Zyw@2m)m@oL4tUnxZA}Sp@H70Nl5jdI1QH; z54<9sbB)|t5k(&Zp4b2}nPM>xp+R>(tp9m6tYEgG<7GjLH8;{VfGv_N3^`vB4W|c{ z>Mv3(R>ST*xC3k;9(`3^!pQPS5_xoI52;~}3cLiTXb&AF0zQ(s<_h$$0O%L$Ex74v zv&)W@Za++TR6Q>Ffd>+$(A-Akrsv?0tqIjJ%CILQz-SbX&WXY!89?6Oew86`=p;Ra zozKTKR2=Z_KKk1Ih)iGJqc9x4%r%t0dhGM z;IlQna%>T+1#=8a*nS0%gm!GQa%Wc2tRJPsg&EssVG7LSbJswdTC9_XI5w8l#cGrO zh{bS(d=P=3^&{vF*T}#s)5wrjBHVqk{_k(>$VmxW?YxfuX1uzbp{ufklIxMDj5ciX za_zT1S#Y$v3idAln+-zDNx&l7-wT1EW5Rhjxs}So)Utk=%y!0$CN}h4F{HS;QTBPh z&1oVYFQ^i;VV59*GJl+NCD872qV5!lNfFM#2j~t~z%yrKsGO3cn~%aaL;f?=5ctpF ztRD3qvr03`?hrD9A4qjA2zzfBD!=mpr@5uZ0Zp9-5R_hX<+@n?m82nH63xS806Ux` z=r|$HL{1#Nz3`$GK^BHx4Z8;&_fGU>^#kUXZh#=6ITO2#5Ye_4vV?;}z?+4UQ&s{c z{t5hSmw<>5IyNbhTn+$c8o85<6~HF1TkY%IHsJ5zC?FP@D$mfK-kf*?#YiegiF(=i*vYG27)(ryy)sY2l0bM3(WAHgMFgVHCX z*-t#Lwtc{cY339RLUGLi2hK^#7X;?r)w}@g1eOj0mVIx&plptWxiue_#7mWEU{84> z%;3;NixePDxNdkh%b_t7C=RS(BjlJPPPoJxS}RkkvqD%&oQb^DY4#Ru8HTkTym?8| z1)F@2Qa?vYfQIO3q8mWo2_QpWAO*TF<%7_`0w02#TgMy_2(?smdl`(n6-(1WgcW&m zTq=P8fVKO7ez}9U|1EP`0rsEo0#+5|b`7{pNrxzFX^?g-2LUkzfT!$h5Oqb*Q&&Nn z(diu^tXGZ*0k?%s1L>4=DlU*Px5%gCRnZ%Xjq24|TYv@nf*=tKl667NK@08*g;{ko zgg#sz>(9a$LQfS~@Ki_^Pu!Mm_W>|z_}B*!2>?pp*BhGxE^Fnb2aaYZ6?9iBR~i6p zaTKnB9ROaAhiS}LEf6bEF3Fr&K-`YAaw?~S1|C*B=UeK 
zwJ`zbfMQ!F;Yfdjm;0p>QDKhU=THa}2afhNqi-&WD4(7DcO%zlt7&-EPkX6_F6aC2 zHd?3FAInwtRG?CVpi8yC6iHwW1u zDEi#fBwR|#_jFuwPWwNZXVDr{>n#>RSSW;jQ>E3g7L^4{<*D+cC66(M4C3AI|LNX~ zO@Gd`7abgMegNSw^7W60$y92zu~xBfcd)D{PJ2dG;<{lt*}Zy_BF$qrOPyNVZD9Gr zE!bH8zY`C+j|#$`{?_Avq*Hr86xTq=n{#e-kyf1*FxBfPePZ|wLT{`CABp&;ySxhkB!}&ubD$eJ33)Qu z*wol5!=7R~)AqhLpYPsC=w+`HKWs8jY>7`#V`4n_+ah|Imu6?Cf8!)kNpYhoMBc76woiS*`OAc4Ij}f#xN+gGFMkOf9`F7P z0P~f&%i=wj`!R>b0+||J*xkn3(^q`~s@Ik+8Nw9P;18xLM7W82iT{ODrot^Px+!=t(6Bk&_U#1Prv*EnHf_Linfc zfA?+13g(NljxYE)I@N{Cf_4Ux27vy@f7+L?M#5|X!sI}pO_=(AL^s+ZJ81)X#XiJ?jtPgb+e z{(Is2ta@c9*< z5xrVgV9zlC94Xx6FT+Z{f+AMNcofDKKkd;{9ZwaEK9y-=l2hoeR zQghR{G-Ya$6`dpR85FT@1xQyy~o; zj%znnYi6iE!o)Pmf#n%?kTj3+aEZ}h-!a@arwBSw$26F3w0d1!t~NC9}!A^fn@2GoXJs}$iRbO ziT*vLSm_0x1+t+Hz=rgtF$US$$tFE<)ZEHq?@oARsszmYQ7x_7~ZaVp}SfkdS_gWvpUPm-XMcx3s|?_#<@LlrAP3RaN>AoV*c79CqEcczmnLB~||IIV{%gL)Kf z_Y&mEbJFr{SiznEn=_)fJL%p@`bon&5onCMsH$^1UhylC@{oQcg?KOZ~$;eeErRfA12 zlUX=c*;r?LN2wRVw{xhothCgFNgX875$&4XR|iW8fzzx~?Wq^qY|qHG5>1lb6Ft$! 
zqeF6WI{OHG!};?BWQ~g~6qNotYo|E^{_x7#Ztl4+9?I3B+Owg*ZtzPj}tMJs7c zrr$d3S=@CwkTD$|IZL`Ig{u+6y-aa>K;Ku0a&M-z&IpbIg z9lsJ0pWlt=iHPgpo>vVT-AZ!g&jrbu7b%YSVGFal1web(lL0$wvOqi?iLjMjtjoVT%4b&TyJ$L<3Tu?xL z>axIs0CfKU6@{#XJ^k~CuumQo#9>)bHBrBM0asiqmBghu|0CE`&?;W;>wk>6@Q!Ff zY@U+O9@{ZbHo9Db#pRDD}}|- zrcbWE6#T=pfFOAK+Ou$zX3xXI0@jRC)vN*UqXks44M>t$JN8*Pu70nlA!UFVQSXRR z5hJqLWPBbr2>zgmTDs90>t;>`NafxZ(`#j-9*7__1)HeF$*pSy2XV54z zR5s=b+B60S2Rlk_2j7Bj0!G*&PeR#f+hDVWT}Rn)rF%Lxhh%n~7d#XD7Pzq6?4rz~kbDC~;Pl8vx^f6BuR*K5qBrHv zUP%FZzRUUAyLd*@>ja!Bs@pqjJKIu4(Uh0-vPM~Rby25KV&~+mo&NO8dCfK8rJ6Q9 z?l4l&B*>l*sjgJUTmK?_o9JAK!{wC9oyhu^L|Ouk{Mx!Hfv;u#4~;j0 z{h`X%MwW0PeRQiYg5n_aFQ394@VuC{J@>$~a0}FR*ZtunT7KURa8I!HMaC|gzEwne z(Pe{g+QRpG`t_?zhntbYUjpXmEJ}${m32VEP*C~mo8^(h)`x{Tx@ysMRb)PR1KxKU zAI#dOAx^JU(@3p95#s`X+-JYf6lJe~CX*f|5T4OD+6>$wvy25Baw*b%+ zFGlF}j{K9nY$FgY%O*Dh_b>b6dufEHRMrm&7S!@Fk8J^akrMy?kb5UPYrH1K30CB{ zoK@W~x^R{RS|>zfilKnWKUp+FWPP5V94loWmuiUD3iLEgIigk&mwPnDcN`0QI)V-v z!Fhe(u3OxhjOqJ77b>pu?&{o*b^=kXJp;%{~SY*oDgPY{ij@QR4XZ zGxDFupZH_j*$jrd#svyRvH_nR&9n^=<8Y%=#bHU1nTo>jJv5v zI8-{5QRtjz974pQ2=Z{hyB$zsL5%r$0qf zgdJHW_N!lq+!OtFxka3c(gjMM^44*$-Em3;4N{cr%|i7%Cxd5%n?D3zsYw#4Yhx>= zpc zszGa#jTomDmXj>rk$>@>{ZdQ6m{trbk_)n<9^sMnL!I7If^Ue`$g>B+;;i(JSB#Yn zX-a||+dOBPr)z+K$UQdk7#r-6;J6bo@em1{d+Sf`eQCl@`d+B9V@di)UE^kL)?vQO zfd|1yr3s#IWduyaVy!i)fQWJq*V(NLKF$cj+r|#U+G@B;h$`o$G(VWC@%WBo2eKojH_m@KK&X?%vNJ(#{yUS##7K`BVUX zdYoi*ibsW&ZyUJDT<39ObHnuArS(L;6M$yEAwI<(Fhlef)kZCX8+~kfkmGzVKJ`po zB%dn;GSKpmn34n#xN-zcZCFIa1+WdnwUqm3tw)NgEDt0{@pv*|V>+T6M5tPXkqp#W zBt0~3_4bASO__NbJ3ZYs>n}_2V3RLE3?1TmK)K{HK(m`mXlxsF)932S^NV!2>hfS< zWda~lD=7zhN3`O42BiomA(<@C^W5XeG53{mL3hC4;reOARbj&Q$P-}m#3=Azo5S_5 zn|hj>8I80I7PLVW8;HR2KrFpThEiO!Du8MN@oD2!&f)d)n?#FS`e8rndGC9VP_LT zhN7qIc1VU;o#XcS=*fKJ-1cv|lV24%6WXTm8za?=GuZ%q=jGwZ7p556GmKr2Oah$c zA)-Dfg{?qU1e-I6_6aWy@bC+|s@nzW8AOfj=Co5lDn$FtiX`c{&pSxJfqkx~-iF9H z`aj8fRV;a?b7(pRDmKmACfF8~BLIJyo2El~iVxwLdbnPt+pj*(I|FmWNczaZk!dKl zo{F5Sg;b-cG5o0p@L28XChY;Sh8O^X+zHjGf5$S9+T*e+lV6zUPI@QT{Nzoot|&8+ 
z@eOB&mWO9nvwnd>H0=0SWUlPgf7AFjfOFcC_Ex3_Zla2A_mfr^)ut@q&g*q)xTkY2GB*@vXqzP9AEyp*QjGo0kNV*V7^2xrct{nd< zRDYL-4A_GPr)oqx-{H*bqQn30baf=!)=rGp6X z_c!))ZXe;)$u}J48jks=%}zDUk|)yxC(=70T0HDtoX*%Lt|hreQb6t=C-0w=@{djK zHxSE2gqu>4Q}JzW;q8H_llrMRh1QA=wP+y5f0*N@HK4XRtFx}o<1=UCHs9=}wGs}4 zexJvg(cE#T)Yx%$ld*VBda7SHr-mZVKmO$b?k~UL_cxDd`R#Iyn&j8|Ra$#?obL!| z;-20=R59B*ST?3tu6JC|R4==?R9iox_sEw4zLitI7a%G+dhZ<1VHcGTHx9lOKIEzG zGc(@d(V$s6IWv~}vFoxiB}3b<=Y?*Mu7ml03{&lP$Su+6+0)f}%{0%qwHfPiA({A* z#*m(J3f>x*ce2z-3K@Tix^wbvisXbys@#No< zf*|KMQqDBjWKYzdE%7GKG02Zu%@`jAR4O9In)rpLIEc^z&ykCZ)u zL~k!YsKR_GF(VjZp<69kvnY9RJwiWf-t|G5nS0FGgan_+wsrHl6E=CP?)Km%sU{w^ z-wsAba*ux%@m$_S!Xff&9RlL^AFyeU9w8onO&hHRS`?wbpiT{?jpni%tHB(LLe!%A zw82`d-3u9s4Nc1gSx{jHoA{_!Z<@F5e=*cP`(2FCd~p|BiUi!7U7})R4{E)w2A0Ul_o{{j=)x^y%wkIsu)*gAL-< z1AP2Fqtkr4lX-lS@=B9U9lXvfZv7!I(0SE1ZMl22H_l73{7d)XG3|*nvkK7s%V9}^ z*WlY$3^uRg;iE^7QtoZf5a#@)X4HQO5r`bC5|()sFg-%|B6J08E@pd7(gr#P+W9?L zB%9>l4G_Kqqz;rec5|!Xbd7NihcUy^(NhYX>c@Ez$|w3GED!;XW3}>xE1dDhlZpcu z%f0iWPuGM2KYR#LMDzvCH|l%+2aB1WU4ea*r9*+U;Ftf9Z)tjJfX~=~d!X+a-rXNH z24v$g<`p>RRq8$!*!7|7GN7-PVxZ0U{I2T}#Sb*%OfK8acm-_f7|)NiJuTe2rI|dI zGDVopdRr>H4`vZ5YEn}=RNmg3?A5vRzD9=EP-jQk^k{oWcxK3#T`gtDZ*R*Q4*3xB zPOFQw;c!G7kYC48fY%uC3nT8u&?E+~I%|gTo9^29gA*0Z^vW55Bkh@fo2Jg+bS>x!ns8rWq{7osC90!hJPrV zT=x`@4BLv*S6QE3Ia(d^lw)QEgVjNuh~E+i>R}Wy-&d;-A6A6LfhSfBC<^j=;EoJn zPiyG74OfScRC+!a^0BA*oYO!L_?`n(UXs-;(BGrIj@L5Df0f+nR?Xtp)u3b?nKI7I zUE$12;;?tW{;=4LIif*`+vNy8rT8AZGDu`$s*|3uS9d$zlqA|vb)8nENp;b;YCiRM@QYOT+Ak!oAiVJ#Rt9dB-5{h+YzZhpIcOUX{k^}UUYvUDw2-;}L5w|y-{ z;yAImS_=in2QbOM7Ho&*^q6&a^-d$SR(X?PPftAg+HLZ!W(WNAD)gzW?NwFw?4&eh z@n9AxrECDR87(b-DzXk9t!*x0VWJIBP;RhY3o>P(M|5k?NC?=_f@TGD8KP|ubZN|Z zNYVZr(M0jYywLg;PaiBGmtZpMDp_l{fP@WY+ksqvftBj#2<+8VRBJ zrV?d$+yMD(*s9-+mUF$7ZSn^3zl7C8jTJwM;=mvY$ciRQClaVoj|0eDjj-`@&Qrcr608FMMrgI0KiimItqIez!>hWN1{kE$T>EEWJg25#MeFm5Js%BS2sL zIu2+pd-^`KEYPnPRoDoZ{aROK2J!Gt9#ovTm*1nzP+xuLKS=a62%_uh(odG24O5nY 
z)ASS*m%aHGNAz$oivA~oPxaWN+AI+ZtF22lrw6mYs|pZ;VcV#3qBTXYXOH!b(#j(xhR<-D`UZW5v6@#Id#p=RhqaNdNn$YO?#J zOIVuFj&3*{$sB-_sP{=_u3&m%R4FPPAf1oRheMoCa@?C}l^Jdubk4n<|Pg|EtAkh_>ee>_?H_=D`y zVH=bBy3Z}}p1b3&Wv@%b9N8r|ToB=uKG^89@=;LXKd-r@)e+R!d35(pbBU|lU!sJZ zlm9R>kazry}%RsCCH(J%_x?FPpr{;jC$JW6b9x zbO){pMPc&Q25To&>|VN53o(O>KX6ic)31m2VM#k7{%mu(L7VGQclfXCGlV+o&{4~4S3$tqJWc=IJV4mvwBd|6CAl9_a^|i zpGtm3y5?bm-})TsW*}jd^w;A4BA)Be2OUP2fXxKW_k8l~z7RCJUK08r>^o|D8F}4vqTeKUR9{i5-vetMaQooWdz~WuLR{{W@ z7ZxFu`%TshvgSPf0-beByQo1E!}1c88S>?Pcj>aaJ}>G+@?PpL z!Et}5d1ZNZf3eywDinW>p9?DJ-;R8F((OBa4&MO5f5rVZA@IMLxXZe;?4~p8ge)1z zl|c+J>9(ZHgdV-Qcd*gX-3`YF>zkWm_|~2G4e7G}@1zomaBFBo~p1^&dtvz$bO$V-}aTyiwtgq@U3s9N#P?>k*RssfwM zgX)$vZT~Cjqn?QKfe{u#bqmVdb2hnYHFkB;;s2Q)rbXRuybb>w4E3E8S#cq4tJzQ6 zX-gWP7CHo{UikG75t9cz!(TZoS#!6Sn$w+-9tS-AJ-W6(#Vlxh;VDIlLY&+YWaA&8 z;EEqVj;fDgwcPZc&4gvISb9*7#d_-+CzCqHXf1C%6p&mcx_Y1;d&G?HAJV?MXw!F^ zSav(uZ>5-?dlbF_y+4P@^~|w$`lk(10LJEhch*$#?@aUwPfMVGrj1Y96WrMa%CI=Y z#kIq4+oFeo>f2d+ik$P#==i;Kt7fFfRnSj=9nAQ4ZbeB8yN8|9q+G7Xh`rFFYmT|C zQdrpNHLo@?hLH)#syyXpvIwqTKRmvHwgJyNQs^>Dm!wvQ$Ps z`ZCgJ!+sar0+kox7WA|apWhs)E9Y9bGmfit3V~hpwv78bci7dvE!_Czx8*_2XRFfQ z3_Zu%FZWQn;}C@GTaC{n)yA;4upv=Ws6!`NGuD0OZb4~_o1^|^7A5zR>3a^uJ~rG* zpo-f>Cm*CF6d`UQAsFdG?sGq6F*bwa>M*7QG@;S{%( zP!}dZyWd&@7ifKCGF@6vsdEj`nPm#?^AZEl>(j<%k@0}c9i{YO2am~6@I4z>X{{o# zr&`Nl73;*~hflViI3HqIH3BTMinbDFn6|i0{@?IW<%M8;*%lmw7M9LQMQ(zf6JHt- z2#>E{63^H+YNbtqp0{jKXV$&fI=h#h7Z9!n3-bRSAlRk*%L50bfr=9)7yVLfgL-P1 zjr~U0ud$2krvK7PVMupcRxaiKps4V_0In47g$66uz#f?_YRg;%=2Zpc&gh?8@o1^1 zDme_27xCgd=>8Ry-GYCvz6%}yLYKk%7~JS(yVaXLkP=OR zlAzXd46(&>QF4{nP9-n*EQAx=9I1Mz*@ULojs2{eaxg6%xkbZHR31N$jF%(;j~B6Y zk3Ra+HaCx3i=diyS?V{4EN9#F*O>=1+pDQ=_2S?w$1WZIP8d3^3W1MDH2s8^0(&k~l_6gK)=xH(psW-{l@ASyy>~evZi>&CgpB z`Ymg!uDkfz0sO-N^!n>06pvo}$Lw*YywT0tZcUc}4V9$M#!N7e_SrkTz?3m5sq1nO z9M1CH-Ke#YvG&YSCXmvU6+@Ge%&ICWv_eW;vQ!Jk^AsF3Cz3obUIfp3vD^|YK(F7@ z$*O-7i5DH1g8Mf_HkrfTompywfc7mMd;A(*zW0b_J>LJ?)v|VBUVillEPpBL>F{Y_ 
zbxl>2c2tZfcMN*XJm?tIp4`DN8F$@v566AhZZ^HNV|F}nSaE&O#~&%BW_mLCzKpK% zxN$ypiNv~Vv)+?kGjTI7SNyuLn;5KWzX4*lIWWpENFeupb>OQN>?I+42N)_oG8s7A z)G^rEFtILZ@PwE71R2ufxF8Zkj48AUGJp5Nc%o6VU zCyw6xp8mPi%dLd(B5!~9`?7P{zZtOs>V99`fOfH%8O!e)BoE!`N3sCC{N2<$5{z#t znOT47&rjAttCp(k>Z0H0Zntb!#$LPyg4Hu?-@UT@LV~e=OuD=fkrt{p+XMDJ^;w#< z!0RC1yIMh)FZHpcHP{<@j_>I!vu`eMd$(n;n%s)YJ+V-+=%5O|J@O46%X|hb-C)5hsQguhwORO|W zmSWH)^wP=oCj$>7=@?O*o8osj<9}=I?btishxPxcqE)Ju8HJt2=l$WPNTKcZv`rEi zjjsdlOzo;@9s|TsvS<@pLovTkuynr-3U{Muv`y_Zcs?j_Ei3g5uWDZ(QkEpQ56NP~ z$woG~`gH~29=hvZo^+cj_t~c#OMkwSu-a1gN9gfT zJ~#7mc8j5-D?Yi7X2!jISY?OW6*q?*k9~Mnt5-m(hm;8c<=R0az6Q%$N*Yn&YeO!G zSJB#wU2RpgF|W?n7t+?BZQp}VjtJY^3}^X_CU-I4R=;6IG47(Ao`FN$u$28I9NC?w zE*tK|0$%}U4CPKIvjs=h4xOzhTTRdufc>F&rq++w!=ojcSbDwKoos;N?S#|ZT7i{KyKRgz)6_z(&S$M z9bd%km#ztZH{F61@a3g!I}upZ;^F&$0jrOZ2H|<&SXTnSyQf`^Irm-HOyG##?0wi> zsKB%U!|afS27-QFdL{t8-CL4|d9!Z}%3PlL4<2&yMAHAxnM>Biu~c7NiuW*onbuQqKLEbh}U!&)g# z{$XSK{m7v%EoLE2OoKoDJ97`lx+C{|_KO#c24w5N_P;;y!T)gjWI{*W{8i!gb#IAd zg5l{uVg?}J#o$AUWc8(I6D!;9vnW*OWI~+K$s~*XaNszb$*1d`-I&9w;a1&bM^_`F zhtJ{O!?!gX>T1*t?rj%-eI8}%bmR3P_&1TE49L9t=J;C;2q{9O7R z8avLq)<(N5f`UqDBfoqGxw0wmX}t9xc(sS--AP+1Rn&}d$}go__b{51*@YghC8_{x z5mnuHYS6_t;H_(_6h5n0{b3u(@>A7Zs(k#x;@QZNH)?8P8t$P)R(5rtHB-#jD9Jrp zp6x-6VcpKBI@=m25opF%PT(7ObPJ9~stKR;OcAZ?yWs+lzbtByTD+*?W=wNt>z-EYIwJRy&yQTe#Xp#4ulechBed`^|Fur^$T0nGxM%s32FzN# z2ToHq(IT^0RcmBI!NmQL&7l$aNs&~aGEdijH9;hbzt~M89M{~8*+Zc&m;1W;$FQZ;XTrm@{HZ$)u9_BQQaf@k5W(o&hKKNZCT`^3!y~J zXP;bksUP3mKc;_`O87}PWH2crOrJf{I?{~RV-8Oh6{yp52-J+e)f!IBJd!S4nBI!j zhzoP9^pI>GH_0gs*-Gh#|D~C3Efrc1(zzzdiTFY!+Aq`_vAqbf6yt&V{87~#BD+NX z1N2$rg;3lHfiQ0lT0TGcukT$pL&>w4 zAG9tk^|D|BYWep$EQDd8q_=RI^lO=EDF6cr3D4!x|2s3K|HA42M|@+h>te}};(x{c z{}sLauekrK-OKx+{_9M>jl+KhpZ`bsZbd>x=k)C9(liyR-$EupPb=(%zL8#z?zyY~ E7nQPc^Z)<= literal 0 HcmV?d00001 diff --git a/examples/trials/mnist-tfv1/config_aml.yml b/examples/trials/mnist-tfv1/config_aml.yml new file mode 100644 index 0000000000..98af856430 --- /dev/null +++ 
b/examples/trials/mnist-tfv1/config_aml.yml @@ -0,0 +1,25 @@ +authorName: default +experimentName: example_mnist +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +trainingServicePlatform: aml +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . + computerTarget: ussc40rscl + nodeCount: 1 +amlConfig: + subscriptionId: ${replace_to_your_subscriptionId} + resourceGroup: ${replace_to_your_resourceGroup} + workspaceName: ${replace_to_your_workspaceName} diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index 7b0a4a8063..8b8872471d 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -18,7 +18,8 @@ parser.add_argument('--docker_image', help='the docker image of job') parser.add_argument('--experiment_name', help='the experiment name') parser.add_argument('--code_dir', help='code directory') - parser.add_argument('--script', help='script') + parser.add_argument('--script', help='script name') + parser.add_argument('--node_count', help='the nodeCount of a run in aml') args = parser.parse_args() ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) @@ -29,7 +30,7 @@ run_config.environment.docker.enabled = True run_config.environment.docker.base_image = args.docker_image run_config.target = compute_target - run_config.node_count = 1 + run_config.node_count = args.node_count config = ScriptRunConfig(source_directory=args.code_dir, script=args.script, run_config=run_config) run = experiment.submit(config) print(run.get_details()["runId"]) diff --git a/src/nni_manager/training_service/common/containerJobData.ts 
b/src/nni_manager/training_service/common/containerJobData.ts index 17a482ae33..f7a29f384a 100644 --- a/src/nni_manager/training_service/common/containerJobData.ts +++ b/src/nni_manager/training_service/common/containerJobData.ts @@ -12,15 +12,3 @@ else # Install nni python3 -m pip install --user --upgrade nni fi`; - -export const AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT: string = -`#!/bin/bash -if python3 -c 'import nni' > /dev/null 2>&1; then - # nni module is already installed, skip - return -else - # Install nni - python3 -m pip install --user --no-cache-dir -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nni==1.63473 -fi`; - - diff --git a/src/nni_manager/training_service/reusable/aml/amlData.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts similarity index 96% rename from src/nni_manager/training_service/reusable/aml/amlData.ts rename to src/nni_manager/training_service/reusable/aml/amlClient.ts index 6a83a22697..cf5921f0b1 100644 --- a/src/nni_manager/training_service/reusable/aml/amlData.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -34,6 +34,7 @@ export class AMLClient { public scriptName: string; public pythonShellClient: undefined | PythonShell; public codeDir: string; + public nodeCount: number; public computerTarget: string; private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?.*?)'`; @@ -43,6 +44,7 @@ export class AMLClient { workspaceName: string, experimentId: string, computerTarget: string, + nodeCount: number, image: string, scriptName: string, codeDir: string, @@ -52,6 +54,7 @@ export class AMLClient { this.workspaceName = workspaceName; this.experimentId = experimentId; this.image = image; + this.nodeCount = nodeCount; this.scriptName = scriptName; this.codeDir = codeDir; this.computerTarget = computerTarget; @@ -70,7 +73,8 @@ export class AMLClient { '--docker_image', this.image, '--experiment_name', `nni_exp_${this.experimentId}`, '--code_dir', this.codeDir, - '--script', 
this.scriptName + '--script', this.scriptName, + '--node_count', this.nodeCount.toString() ] }); this.pythonShellClient.on('message', function (envId: any) { diff --git a/src/nni_manager/training_service/aml/amlConfig.ts b/src/nni_manager/training_service/reusable/aml/amlConfig.ts similarity index 95% rename from src/nni_manager/training_service/aml/amlConfig.ts rename to src/nni_manager/training_service/reusable/aml/amlConfig.ts index 3c85c554b6..0bed1b135c 100644 --- a/src/nni_manager/training_service/aml/amlConfig.ts +++ b/src/nni_manager/training_service/reusable/aml/amlConfig.ts @@ -3,8 +3,8 @@ 'use strict'; -import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../common/trainingService'; -import {TrialConfig} from '../common/trialConfig'; +import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; +import {TrialConfig} from '../../common/trialConfig'; export class AMLClusterConfig { public readonly subscriptionId: string; diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 337768c92a..5ae063e841 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -26,9 +26,9 @@ import * as component from '../../../common/component'; import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../../common/log'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; -import { AMLClusterConfig, AMLTrialConfig, AMLTrialJobDetail } from '../../aml/amlConfig'; +import { AMLClusterConfig, AMLTrialConfig, AMLTrialJobDetail } from '../aml/amlConfig'; import { EnvironmentInformation, EnvironmentService } from '../environment'; -import { AMLClient } from '../aml/amlData'; +import { 
AMLClient } from '../aml/amlClient'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetric @@ -143,6 +143,7 @@ export class AMLEnvironmentService implements EnvironmentService { this.amlClusterConfig.workspaceName, this.experimentId, this.amlTrialConfig.computerTarget, + this.amlTrialConfig.nodeCount, this.amlTrialConfig.image, 'nni_script.py', environment.environmentLocalTempFolder diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 5429a69381..e6dc57bb6e 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -26,7 +26,7 @@ import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetri import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; -import { AMLClusterConfig } from '../aml/amlConfig'; +import { AMLClusterConfig } from './aml/amlConfig'; import { PAIK8STrainingService } from '../pai/paiK8S/paiK8STrainingService'; import { EnvironmentService } from './environment'; import { OpenPaiEnvironmentService } from './environments/openPaiEnvironmentService'; diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 9e3f37ca7d..215b5ed147 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -29,7 +29,7 @@ import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobM import { delay, getLogLevel, getVersion, uniqueString, getExperimentRootDir } from '../../common/utils'; import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, SEND_TRIAL_JOB_PARAMETER, TRIAL_END } from '../../core/commands'; 
import { GPUSummary } from '../../training_service/common/gpuData'; -import { CONTAINER_INSTALL_NNI_SHELL_FORMAT, AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; +import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { validateCodeDir, execMkdir, execCopydir, tarAdd } from '../common/util'; @@ -464,7 +464,7 @@ class TrialDispatcher implements TrainingService { this.runnerSettings.command = this.trialConfig.command; await fs.promises.writeFile(runnerSettingsPath, JSON.stringify(this.runnerSettings), { encoding: 'utf8' }); const installFilePath = path.join(environmentLocalTempFolder, "install_nni.sh"); - await fs.promises.writeFile(installFilePath, AML_CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); + await fs.promises.writeFile(installFilePath, { encoding: 'utf8' }); environment.command = `import os\nos.system('sh install_nni.sh && mkdir ${this.experimentId} && cd ${this.experimentId} && python3 -m nni_trial_tool.trial_runner')`; environment.environmentLocalTempFolder = environmentLocalTempFolder; await tarAdd(path.join(environmentLocalTempFolder, 'nni-code.tar.gz'), this.trialConfig.codeDir); From 586d6ac85c8593fad84ff085c5a1beeff8cb1162 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 13:55:15 +0800 Subject: [PATCH 48/98] support log level in UT --- src/nni_manager/common/utils.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/nni_manager/common/utils.ts b/src/nni_manager/common/utils.ts index ddd8a26345..7150ca19f9 100644 --- a/src/nni_manager/common/utils.ts +++ b/src/nni_manager/common/utils.ts @@ -19,6 +19,7 @@ import { Database, DataStore } from './datastore'; import { ExperimentStartupInfo, getExperimentStartupInfo, setExperimentStartupInfo } from './experimentStartupInfo'; import { ExperimentParams, Manager } from 
'./manager'; import { HyperParameters, TrainingService, TrialJobStatus } from './trainingService'; +import { logLevelNameMap } from './log'; function getExperimentRootDir(): string { return getExperimentStartupInfo() @@ -184,7 +185,12 @@ function prepareUnitTest(): void { Container.snapshot(TrainingService); Container.snapshot(Manager); - setExperimentStartupInfo(true, 'unittest', 8080, 'unittest'); + const logLevel: string = parseArg(['--log_level', '-ll']); + if (logLevel.length > 0 && !logLevelNameMap.has(logLevel)) { + console.log(`FATAL: invalid log_level: ${logLevel}`); + } + + setExperimentStartupInfo(true, 'unittest', 8080, 'unittest', undefined, logLevel); mkDirPSync(getLogDir()); const sqliteFile: string = path.join(getDefaultDatabaseDir(), 'nni.sqlite'); From 2db8ff856eaa60a0ca84ede1c7c36a32880eed28 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 15:51:33 +0800 Subject: [PATCH 49/98] refine interface to support aml better. --- .../reusable/commandChannel.ts | 2 +- .../training_service/reusable/environment.ts | 11 +++++ .../environments/openPaiEnvironmentService.ts | 4 +- .../reusable/trialDispatcher.ts | 47 +++++++++++++++---- 4 files changed, 53 insertions(+), 11 deletions(-) diff --git a/src/nni_manager/training_service/reusable/commandChannel.ts b/src/nni_manager/training_service/reusable/commandChannel.ts index a20b6a1b38..bd8232c6d0 100644 --- a/src/nni_manager/training_service/reusable/commandChannel.ts +++ b/src/nni_manager/training_service/reusable/commandChannel.ts @@ -26,7 +26,7 @@ export class Command { } } -export abstract class RunnerConnection { +export class RunnerConnection { public readonly environment: EnvironmentInformation; constructor(environment: EnvironmentInformation) { diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 252c805d8a..0a950e32bc 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ 
b/src/nni_manager/training_service/reusable/environment.ts @@ -6,6 +6,9 @@ import { GPUSummary } from "training_service/common/gpuData"; import { getLogger, Logger } from "../../common/log"; import { TrialJobStatus } from "../../common/trainingService"; +import { EventEmitter } from "events"; +import { WebCommandChannel } from "./channels/webCommandChannel"; +import { CommandChannel } from "./commandChannel"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; @@ -19,6 +22,14 @@ export abstract class EnvironmentService { public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; public abstract startEnvironment(environment: EnvironmentInformation): Promise; public abstract stopEnvironment(environment: EnvironmentInformation): Promise; + + public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { + return new WebCommandChannel(commandEmitter); + } + + public createEnviornmentInfomation(envId: string, envName: string): EnvironmentInformation { + return new EnvironmentInformation(envId, envName); + } } export class NodeInfomation { diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index df6a8a825c..ba8a3f4473 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -21,7 +21,7 @@ const yaml = require('js-yaml'); * Collector PAI jobs info from PAI cluster, and update pai job status locally */ @component.Singleton -export class OpenPaiEnvironmentService implements EnvironmentService { +export class OpenPaiEnvironmentService extends EnvironmentService { private readonly log: Logger = getLogger(); private paiClusterConfig: PAIClusterConfig | undefined; @@ -35,11 +35,11 @@ export class 
OpenPaiEnvironmentService implements EnvironmentService { private experimentId: string; constructor() { + super(); this.paiTokenUpdateInterval = 7200000; //2hours this.experimentId = getExperimentId(); } - public get hasStorageService(): boolean { return true; } diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 2bf4bc4329..25c0137e31 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -43,11 +43,11 @@ class TrialDispatcher implements TrainingService { private trialConfig: TrialConfig | undefined; private runnerSettings: RunnerSettings; - private commandEmitter: EventEmitter; + private commandEmitter: EventEmitter | undefined; + private commandChannel: CommandChannel | undefined; private readonly trials: Map; private readonly environments: Map; - private readonly commandChannel: CommandChannel; constructor() { this.log = getLogger(); @@ -60,9 +60,6 @@ class TrialDispatcher implements TrainingService { this.runnerSettings.experimentId = this.experimentId; this.runnerSettings.platform = getPlatform(); - this.commandEmitter = new EventEmitter(); - this.commandChannel = new WebCommandChannel(this.commandEmitter); - const logLevel = getLogLevel(); this.log.debug(`current folder ${__dirname}`); // different source folder in Linux and Windows @@ -119,6 +116,9 @@ class TrialDispatcher implements TrainingService { if (environment === undefined) { throw new Error(`TrialDispatcher: trial ${trialJobId}'s env shouldn't be undefined in updateTrialJob.`); } + if (this.commandChannel === undefined) { + throw new Error(`TrialDispatcher: commandChannel shouldn't be undefined in updateTrialJob.`); + } const message = { "trialId": trialJobId, @@ -130,6 +130,9 @@ class TrialDispatcher implements TrainingService { } public async cancelTrialJob(trialJobId: string, isEarlyStopped?: boolean | undefined): 
Promise { + if (this.commandChannel === undefined) { + throw new Error(`TrialDispatcher: commandChannel shouldn't be undefined in cancelTrialJob.`); + } const trial = await this.getTrialJob(trialJobId); switch (trial.status) { case "RUNNING": @@ -150,6 +153,11 @@ class TrialDispatcher implements TrainingService { } public async run(): Promise { + const environmentService = component.get(EnvironmentService); + + this.commandEmitter = new EventEmitter(); + this.commandChannel = new WebCommandChannel(this.commandEmitter); + await this.jobRestServer.start(); this.jobRestServer.setEnableVersionCheck = this.versionCheck; @@ -157,8 +165,11 @@ class TrialDispatcher implements TrainingService { this.runnerSettings.nniManagerPort = this.jobRestServer.clusterRestServerPort; this.runnerSettings.commandChannel = this.commandChannel.channelName; + // for AML channel, other channels can ignore this. + this.commandChannel.config("MetricEmitter", this.metricsEmitter); // for restful api, other channel can ignore this. 
this.commandChannel.config("RestServer", this.jobRestServer.Server); + // start channel this.commandEmitter.on("command", (command: Command): void => { this.handleCommand(command).catch((err: Error) => { @@ -172,7 +183,6 @@ class TrialDispatcher implements TrainingService { throw new Error(`trial config shouldn't be undefined in run()`); } - const environmentService = component.get(EnvironmentService); if (environmentService.hasStorageService) { this.log.info(`TrialDispatcher: copying code and settings.`); const storageService = component.get(StorageService); @@ -246,6 +256,12 @@ class TrialDispatcher implements TrainingService { } public async cleanUp(): Promise { + if (this.commandChannel === undefined) { + throw new Error(`TrialDispatcher: commandChannel shouldn't be undefined in cleanUp.`); + } + if (this.commandEmitter === undefined) { + throw new Error(`TrialDispatcher: commandEmitter shouldn't be undefined in cleanUp.`); + } this.stopping = true; const environmentService = component.get(EnvironmentService); const environments = [...this.environments.values()]; @@ -272,6 +288,9 @@ class TrialDispatcher implements TrainingService { } private async environmentMaintenanceLoop(): Promise { + if (this.commandChannel === undefined) { + throw new Error(`TrialDispatcher: commandChannel shouldn't be undefined in environmentMaintenanceLoop.`); + } const environmentService = component.get(EnvironmentService); while (!this.stopping) { const environments: EnvironmentInformation[] = []; @@ -305,6 +324,10 @@ class TrialDispatcher implements TrainingService { } private async trialManagementLoop(): Promise { + if (this.commandChannel === undefined) { + throw new Error(`TrialDispatcher: commandChannel shouldn't be undefined in trialManagementLoop.`); + } + while (!this.stopping) { await delay(2000); @@ -414,10 +437,14 @@ class TrialDispatcher implements TrainingService { } private async requestEnvironment(): Promise { + if (this.commandChannel === undefined) { + throw new 
Error(`TrialDispatcher: commandChannel shouldn't be undefined in requestEnvironment.`); + } + const environmentService = component.get(EnvironmentService); const envId = uniqueString(5); - const name = `nni_exp_${this.experimentId}_env_${envId}`; - const environment = new EnvironmentInformation(envId, name); + const envName = `nni_exp_${this.experimentId}_env_${envId}`; + const environment = environmentService.createEnviornmentInfomation(envId, envName); environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; @@ -448,6 +475,10 @@ class TrialDispatcher implements TrainingService { } private async assignEnvironment(trial: TrialDetail, environment: EnvironmentInformation): Promise { + if (this.commandChannel === undefined) { + throw new Error(`TrialDispatcher: commandChannel shouldn't be undefined in assignEnvironment.`); + } + if (trial.environment) { throw new Error(`trial ${trial.id} has assigned environment ${trial.environment.id} already, not assign to ${environment.id}!`); } From f631e4c537e9476959e2ff328feadf2e46fce2b8 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 15:51:51 +0800 Subject: [PATCH 50/98] fix runtime error on exit --- tools/nni_trial_tool/web_channel.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/nni_trial_tool/web_channel.py b/tools/nni_trial_tool/web_channel.py index d386f47b70..752a303cb0 100644 --- a/tools/nni_trial_tool/web_channel.py +++ b/tools/nni_trial_tool/web_channel.py @@ -32,8 +32,9 @@ def _inner_open(self): def _inner_close(self): if self.client is not None: - self._event_loop.run_until_complete(self.client.close()) - self._event_loop.close() + self.client.close() + if self._event_loop.is_running(): + self._event_loop.close() self.client = None self._event_loop = None From f687a6e2b8ad2c4de3653f0d0f999aa1c77bbe45 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 16:23:18 +0800 Subject: [PATCH 51/98] fix eslint error --- 
.../training_service/reusable/environment.ts | 84 +++++++++---------- 1 file changed, 42 insertions(+), 42 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 0a950e32bc..14db7a661d 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -14,48 +14,6 @@ import { CommandChannel } from "./commandChannel"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; export type Channel = "web" | "file" | "aml" | "ut"; -export abstract class EnvironmentService { - - public abstract get hasStorageService(): boolean; - - public abstract config(key: string, value: string): Promise; - public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; - public abstract startEnvironment(environment: EnvironmentInformation): Promise; - public abstract stopEnvironment(environment: EnvironmentInformation): Promise; - - public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { - return new WebCommandChannel(commandEmitter); - } - - public createEnviornmentInfomation(envId: string, envName: string): EnvironmentInformation { - return new EnvironmentInformation(envId, envName); - } -} - -export class NodeInfomation { - public id: string; - public status: TrialJobStatus = "UNKNOWN"; - public endTime?: number; - - constructor(id: string) { - this.id = id; - } -} - -export class RunnerSettings { - public experimentId: string = ""; - public platform: string = ""; - public nniManagerIP: string = ""; - public nniManagerPort: number = 8081; - public nniManagerVersion: string = ""; - public logCollection: string = "none"; - public command: string = ""; - public enableGpuCollector: boolean = false; - - // specify which communication channel is used by runner. 
- // supported channel includes: rest, storage, aml - public commandChannel: Channel = "file"; -} export class EnvironmentInformation { private log: Logger; @@ -107,3 +65,45 @@ export class EnvironmentInformation { } } } +export abstract class EnvironmentService { + + public abstract get hasStorageService(): boolean; + + public abstract config(key: string, value: string): Promise; + public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; + public abstract startEnvironment(environment: EnvironmentInformation): Promise; + public abstract stopEnvironment(environment: EnvironmentInformation): Promise; + + public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { + return new WebCommandChannel(commandEmitter); + } + + public createEnviornmentInfomation(envId: string, envName: string): EnvironmentInformation { + return new EnvironmentInformation(envId, envName); + } +} + +export class NodeInfomation { + public id: string; + public status: TrialJobStatus = "UNKNOWN"; + public endTime?: number; + + constructor(id: string) { + this.id = id; + } +} + +export class RunnerSettings { + public experimentId: string = ""; + public platform: string = ""; + public nniManagerIP: string = ""; + public nniManagerPort: number = 8081; + public nniManagerVersion: string = ""; + public logCollection: string = "none"; + public command: string = ""; + public enableGpuCollector: boolean = false; + + // specify which communication channel is used by runner. 
+ // supported channel includes: rest, storage, aml + public commandChannel: Channel = "file"; +} From 476ffecfeaacad3edcd5646d379ed942fa1dd354 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 18:24:53 +0800 Subject: [PATCH 52/98] send metric data from channel --- src/nni_manager/core/commands.ts | 3 + .../reusable/channels/webCommandChannel.ts | 25 +++--- .../reusable/commandChannel.ts | 20 +++-- .../reusable/jobRestServer.ts | 78 ---------------- .../reusable/trialDispatcher.ts | 89 ++++++++++++++----- tools/nni_trial_tool/base_channel.py | 26 +----- tools/nni_trial_tool/commands.py | 24 +++++ tools/nni_trial_tool/log_utils.py | 15 +++- tools/nni_trial_tool/trial.py | 4 +- tools/nni_trial_tool/trial_runner.py | 27 +++--- 10 files changed, 144 insertions(+), 167 deletions(-) delete mode 100644 src/nni_manager/training_service/reusable/jobRestServer.ts create mode 100644 tools/nni_trial_tool/commands.py diff --git a/src/nni_manager/core/commands.ts b/src/nni_manager/core/commands.ts index 575f492f14..93f69cdb5e 100644 --- a/src/nni_manager/core/commands.ts +++ b/src/nni_manager/core/commands.ts @@ -13,6 +13,7 @@ const TERMINATE = 'TE'; const PING = 'PI'; const GPU_INFO = 'GI'; +const STDOUT = 'SO'; const INITIALIZED = 'ID'; const NEW_TRIAL_JOB = 'TR'; @@ -30,6 +31,7 @@ const TRIAL_COMMANDS: Set = new Set([ INITIALIZED, TRIAL_END, GPU_INFO, + STDOUT, ]); const TUNER_COMMANDS: Set = new Set([ @@ -68,6 +70,7 @@ export { TERMINATE, PING, GPU_INFO, + STDOUT, INITIALIZED, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, diff --git a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts index d380d75a64..774b1a8ffc 100644 --- a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts @@ -3,9 +3,8 @@ 'use strict'; -import { Server as HttpServer } from 'http'; import { Server as SocketServer 
} from "ws"; -import { getExperimentId } from "../../../common/experimentStartupInfo"; +import { getBasePort, getExperimentId } from "../../../common/experimentStartupInfo"; import { INITIALIZED } from '../../../core/commands'; import { CommandChannel, RunnerConnection } from "../commandChannel"; import { Channel, EnvironmentInformation } from "../environment"; @@ -31,7 +30,6 @@ class WebRunnerConnection extends RunnerConnection { export class WebCommandChannel extends CommandChannel { private readonly expId: string = getExperimentId(); - private httpServer: HttpServer | undefined; private webSocketServer: SocketServer | undefined; private clients: Map = new Map(); @@ -39,29 +37,26 @@ export class WebCommandChannel extends CommandChannel { return "web"; } - public async config(key: string, value: any): Promise { - switch (key) { - case "RestServer": - this.httpServer = value as HttpServer; - break; - } + public async config(_key: string, _value: any): Promise { + // do nothing } public async start(): Promise { - if (this.httpServer === undefined) { - throw new Error(`http server is not initialized!`); - } - - const server = this.httpServer; - this.webSocketServer = new SocketServer({ server }); + const port = getBasePort() + 1; + this.webSocketServer = new SocketServer({ port }); this.webSocketServer.on('connection', (client: WebSocket) => { this.log.debug(`WebCommandChannel: received connection`); + client.onerror = (event): void => { + this.log.error(`error on client ${JSON.stringify(event)}`); + } this.clients.set(client, undefined); client.onmessage = (message): void => { this.receivedWebSocketMessage(client, message); }; + }).on('error', (error) => { + this.log.error(`error on websocket server ${error}`); }); } diff --git a/src/nni_manager/training_service/reusable/commandChannel.ts b/src/nni_manager/training_service/reusable/commandChannel.ts index bd8232c6d0..80b1ddc5b0 100644 --- a/src/nni_manager/training_service/reusable/commandChannel.ts +++ 
b/src/nni_manager/training_service/reusable/commandChannel.ts @@ -4,10 +4,10 @@ 'use strict'; import { EventEmitter } from "events"; +import { getLogger, Logger } from "../../common/log"; import { TRIAL_COMMANDS } from "../../core/commands"; import { encodeCommand } from "../../core/ipcInterface"; -import { EnvironmentInformation, Channel } from "./environment"; -import { Logger, getLogger } from "../../common/log"; +import { Channel, EnvironmentInformation } from "./environment"; const acceptedCommands: Set = new Set(TRIAL_COMMANDS); @@ -96,14 +96,18 @@ export abstract class CommandChannel { if (undefined !== matches.groups) { const commandType = matches.groups["type"]; const dataLength = parseInt(matches.groups["length"]); - let data: any = matches.groups["data"]; + const data: any = matches.groups["data"]; if (dataLength !== data.length) { throw new Error(`dataLength ${dataLength} not equal to actual length ${data.length}: ${data}`); } - // to handle encode('utf8') of Python - data = JSON.parse('"' + data.split('"').join('\\"') + '"'); - const finalData = JSON.parse(data); - commands.push([commandType, finalData]); + try { + const finalData = JSON.parse(data); + // to handle encode('utf8') of Python + commands.push([commandType, finalData]); + } catch (error) { + this.log.error(`CommandChannel: error on parseCommands ${error}, original: ${matches.groups["data"]}`); + throw error; + } } matches = this.commandPattern.exec(content); } @@ -119,7 +123,7 @@ export abstract class CommandChannel { const data = parsedResult[1]; const command = new Command(environment, commandType, data); this.commandEmitter.emit("command", command); - this.log.trace(`CommandChannel: env ${environment.id} emit command: ${commandType}, ${data}`); + this.log.trace(`CommandChannel: env ${environment.id} emit command: ${commandType}, ${data}.`); } } } diff --git a/src/nni_manager/training_service/reusable/jobRestServer.ts b/src/nni_manager/training_service/reusable/jobRestServer.ts deleted 
file mode 100644 index cb5c96fa44..0000000000 --- a/src/nni_manager/training_service/reusable/jobRestServer.ts +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) Microsoft Corporation. -// Licensed under the MIT license. - -'use strict'; - -import { EventEmitter } from 'events'; -import { Request, Response, Router } from 'express'; -import { ClusterJobRestServer } from '../common/clusterJobRestServer'; -import { Server } from 'http'; - -export interface ParameterFileMeta { - readonly experimentId: string; - readonly trialId: string; - readonly filePath: string; -} - -/** - * TODO: it should be merged into ClusterJobRestServer - */ -export class JobRestServer extends ClusterJobRestServer { - protected parameterFileMetaList: ParameterFileMeta[] = []; - - protected readonly metricsEmitter: EventEmitter; - - /** - * constructor to provide NNIRestServer's own rest property, e.g. port - */ - constructor(metricsEmitter: EventEmitter) { - super(); - this.metricsEmitter = metricsEmitter; - this.setEnableVersionCheck = true; - } - - public get Server(): Server { - return this.server; - } - - - protected handleTrialMetrics(jobId: string, metrics: any[]): void { - // Split metrics array into single metric, then emit - // Warning: If not split metrics into single ones, the behavior will be UNKNOWN - for (const singleMetric of metrics) { - this.metricsEmitter.emit('metric', { - id: jobId, - data: singleMetric - }); - } - } - - protected createRestHandler(): Router { - const router: Router = super.createRestHandler(); - - router.post(`/parameter-file-meta`, (req: Request, res: Response) => { - try { - this.log.info(`POST /parameter-file-meta, body is ${JSON.stringify(req.body)}`); - this.parameterFileMetaList.push(req.body); - res.send(); - } catch (err) { - this.log.error(`POST parameter-file-meta error: ${err}`); - res.status(500); - res.send(err.message); - } - }); - - router.get(`/parameter-file-meta`, (req: Request, res: Response) => { - try { - this.log.info(`GET 
/parameter-file-meta`); - res.send(this.parameterFileMetaList); - } catch (err) { - this.log.error(`GET parameter-file-meta error: ${err}`); - res.status(500); - res.send(err.message); - } - }); - - return router; - } -} diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 25c0137e31..803388d92d 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -6,12 +6,14 @@ import { EventEmitter } from 'events'; import * as fs from 'fs'; import * as path from 'path'; +import { Writable } from 'stream'; +import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; -import { getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; +import { getExperimentId, getPlatform, getBasePort } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; -import { delay, getLogLevel, getVersion, uniqueString } from '../../common/utils'; -import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, SEND_TRIAL_JOB_PARAMETER, TRIAL_END } from '../../core/commands'; +import { delay, getExperimentRootDir, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils'; +import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END } from '../../core/commands'; import { GPUSummary } from '../../training_service/common/gpuData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; @@ -20,10 +22,10 @@ import { validateCodeDir } from '../common/util'; import { WebCommandChannel } from './channels/webCommandChannel'; import { 
Command, CommandChannel } from './commandChannel'; import { EnvironmentInformation, EnvironmentService, NodeInfomation, RunnerSettings } from './environment'; -import { JobRestServer } from './jobRestServer'; import { StorageService } from './storageService'; import { TrialDetail } from './trial'; + /** * It uses to manage jobs on training platforms * and expose trial as trial job to upper level. @@ -31,11 +33,11 @@ import { TrialDetail } from './trial'; @component.Singleton class TrialDispatcher implements TrainingService { + private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?.*?)'`; private readonly log: Logger; private readonly isDeveloping: boolean = false; private stopping: boolean = false; - private jobRestServer: JobRestServer; private readonly metricsEmitter: EventEmitter; private versionCheck: boolean = true; private readonly experimentId: string; @@ -54,7 +56,6 @@ class TrialDispatcher implements TrainingService { this.trials = new Map(); this.environments = new Map(); this.metricsEmitter = new EventEmitter(); - this.jobRestServer = new JobRestServer(this.metricsEmitter); this.experimentId = getExperimentId(); this.runnerSettings = new RunnerSettings(); this.runnerSettings.experimentId = this.experimentId; @@ -158,17 +159,12 @@ class TrialDispatcher implements TrainingService { this.commandEmitter = new EventEmitter(); this.commandChannel = new WebCommandChannel(this.commandEmitter); - - await this.jobRestServer.start(); - this.jobRestServer.setEnableVersionCheck = this.versionCheck; - this.log.info(`TrialDispatcher: rest server listening on: ${this.jobRestServer.endPoint}`); - this.runnerSettings.nniManagerPort = this.jobRestServer.clusterRestServerPort; + // TODO it's a hard code of web channel, it needs to be improved. + this.runnerSettings.nniManagerPort = getBasePort() + 1; this.runnerSettings.commandChannel = this.commandChannel.channelName; // for AML channel, other channels can ignore this. 
this.commandChannel.config("MetricEmitter", this.metricsEmitter); - // for restful api, other channel can ignore this. - this.commandChannel.config("RestServer", this.jobRestServer.Server); // start channel this.commandEmitter.on("command", (command: Command): void => { @@ -276,13 +272,6 @@ class TrialDispatcher implements TrainingService { } } - try { - await this.jobRestServer.stop(); - this.log.info('Rest server stopped successfully.'); - } catch (error) { - this.log.error(`Rest server stopped failed, error: ${error.message}`); - } - this.commandEmitter.off("command", this.handleCommand); this.commandChannel.stop(); } @@ -510,14 +499,67 @@ class TrialDispatcher implements TrainingService { trial.environment = undefined; } + private async handleMetricData(trialId: string, data: any): Promise { + if (Array.isArray(data)) { + for (const subItem of data) { + this.metricsEmitter.emit('metric', { + id: trialId, + data: subItem + }); + } + } else { + this.metricsEmitter.emit('metric', { + id: trialId, + data: data + }); + } + } + + private async handleStdout(commandData: any): Promise { + const trialLogDir: string = path.join(getExperimentRootDir(), 'trials', commandData["trial"]); + mkDirPSync(trialLogDir); + const trialLogPath: string = path.join(trialLogDir, 'stdout_log_collection.log'); + try { + let skipLogging: boolean = false; + if (commandData["tag"] === 'trial' && commandData["msg"] !== undefined) { + const message = commandData["msg"]; + const metricsContent: any = message.match(this.NNI_METRICS_PATTERN); + if (metricsContent && metricsContent.groups) { + const key: string = 'metrics'; + const data = metricsContent.groups[key]; + const metricData = JSON.parse('"' + data.split('"').join('\\"') + '"'); + await this.handleMetricData(commandData["trial"], metricData); + skipLogging = true; + } + } + + if (!skipLogging) { + // Construct write stream to write remote trial's log into local file + const writeStream: Writable = fs.createWriteStream(trialLogPath, { + 
flags: 'a+', + encoding: 'utf8', + autoClose: true + }); + + writeStream.write(String.Format('{0}\n', commandData["msg"])); + writeStream.end(); + } + } catch (err) { + this.log.error(`TrialDispatcher: handleStdout error: ${err}`); + } + } + private async handleCommand(command: Command): Promise { this.log.debug(`TrialDispatcher: env ${command.environment.id} received command ${command.command}, data: ${command.data}`); const environment = command.environment; const data = command.data; const nodeId = data["node"]; switch (command.command) { - case GPU_INFO: - environment.gpuSummary.set(nodeId, (data)); + case REPORT_METRIC_DATA: + this.log.error(`TrialDispatcher: TODO: not implement to handle direct REPORT_METRIC_DATA command yet.`); + break; + case STDOUT: + await this.handleStdout(data); break; case INITIALIZED: { @@ -554,6 +596,9 @@ class TrialDispatcher implements TrainingService { } } break; + case GPU_INFO: + environment.gpuSummary.set(nodeId, (data)); + break; case TRIAL_END: { const trialId = data["trial"]; diff --git a/tools/nni_trial_tool/base_channel.py b/tools/nni_trial_tool/base_channel.py index 6bd73a8c92..c1ce564ba8 100644 --- a/tools/nni_trial_tool/base_channel.py +++ b/tools/nni_trial_tool/base_channel.py @@ -5,30 +5,10 @@ import threading import time from abc import ABC, abstractmethod -from enum import Enum from queue import Empty, Queue from .log_utils import LogType, nni_log - - -class CommandType(Enum): - Initialize = b'IN' - RequestTrialJobs = b'GE' - ReportMetricData = b'ME' - ReportGpuInfo = b'GI' - UpdateSearchSpace = b'SS' - ImportData = b'FD' - AddCustomizedTrialJob = b'AD' - TrialEnd = b'EN' - Terminate = b'TE' - Ping = b'PI' - - Initialized = b'ID' - NewTrialJob = b'TR' - SendTrialJobParameter = b'SP' - NoMoreTrialJobs = b'NO' - KillTrialJob = b'KI' - +from .commands import CommandType INTERVAL_SECONDS = 0.5 @@ -171,8 +151,4 @@ def _send_loop(self): # do nothing, if no command received. 
pass if message is not None: - if self.node_id is None: - nni_log(LogType.Info, 'Sending command: %s' % message) - else: - nni_log(LogType.Info, 'Sending command(%s): %s' % (self.node_id, message)) self._inner_send(message) diff --git a/tools/nni_trial_tool/commands.py b/tools/nni_trial_tool/commands.py new file mode 100644 index 0000000000..2ecb46c9c1 --- /dev/null +++ b/tools/nni_trial_tool/commands.py @@ -0,0 +1,24 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + +from enum import Enum + + +class CommandType(Enum): + Initialize = b'IN' + RequestTrialJobs = b'GE' + ReportMetricData = b'ME' + ReportGpuInfo = b'GI' + UpdateSearchSpace = b'SS' + ImportData = b'FD' + AddCustomizedTrialJob = b'AD' + TrialEnd = b'EN' + Terminate = b'TE' + Ping = b'PI' + + Initialized = b'ID' + NewTrialJob = b'TR' + SendTrialJobParameter = b'SP' + NoMoreTrialJobs = b'NO' + KillTrialJob = b'KI' + StdOut = b'SO' diff --git a/tools/nni_trial_tool/log_utils.py b/tools/nni_trial_tool/log_utils.py index 13e5141985..09cd096e46 100644 --- a/tools/nni_trial_tool/log_utils.py +++ b/tools/nni_trial_tool/log_utils.py @@ -18,6 +18,7 @@ from .rest_utils import rest_post from .url_utils import gen_send_stdout_url +from .commands import CommandType @unique @@ -43,13 +44,14 @@ def nni_log(log_type, log_message): class NNIRestLogHanlder(StreamHandler): - def __init__(self, host, port, tag, trial_id, std_output_type=StdOutputType.Stdout): + def __init__(self, host, port, tag, trial_id, channel, std_output_type=StdOutputType.Stdout): StreamHandler.__init__(self) self.host = host self.port = port self.tag = tag self.std_output_type = std_output_type self.trial_id = trial_id + self.channel = channel self.orig_stdout = sys.__stdout__ self.orig_stderr = sys.__stderr__ @@ -60,7 +62,12 @@ def emit(self, record): log_entry['msg'] = self.format(record) try: - rest_post(gen_send_stdout_url(self.host, self.port, self.trial_id), json.dumps(log_entry), 10, True) + if self.channel is 
None: + rest_post(gen_send_stdout_url(self.host, self.port, self.trial_id), json.dumps(log_entry), 10, True) + else: + if self.trial_id is not None: + log_entry["trial"] = self.trial_id + self.channel.send(CommandType.StdOut, log_entry) except Exception as e: self.orig_stderr.write(str(e) + '\n') self.orig_stderr.flush() @@ -71,7 +78,7 @@ class RemoteLogger(object): NNI remote logger """ - def __init__(self, syslog_host, syslog_port, tag, std_output_type, log_collection, trial_id=None, log_level=logging.INFO): + def __init__(self, syslog_host, syslog_port, tag, std_output_type, log_collection, trial_id=None, channel=None, log_level=logging.INFO): ''' constructor ''' @@ -79,7 +86,7 @@ def __init__(self, syslog_host, syslog_port, tag, std_output_type, log_collectio self.log_level = log_level self.logger.setLevel(self.log_level) self.pipeReader = None - self.handler = NNIRestLogHanlder(syslog_host, syslog_port, tag, trial_id) + self.handler = NNIRestLogHanlder(syslog_host, syslog_port, tag, trial_id, channel) self.logger.addHandler(self.handler) if std_output_type == StdOutputType.Stdout: self.orig_stdout = sys.__stdout__ diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index dd15ca2ec2..e2753b1518 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -12,7 +12,7 @@ import psutil from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log -from .base_channel import CommandType +from .commands import CommandType trial_output_path_name = ".nni" @@ -42,7 +42,7 @@ def __init__(self, args, data): def run(self): # redirect trial's stdout and stderr to syslog self.trial_syslogger_stdout = RemoteLogger(self.args.nnimanager_ip, self.args.nnimanager_port, 'trial', StdOutputType.Stdout, - self.args.log_collection, self.id) + self.args.log_collection, self.id, self.args.command_channel) nni_log(LogType.Info, "%s: start to run trial" % self.name) diff --git a/tools/nni_trial_tool/trial_runner.py 
b/tools/nni_trial_tool/trial_runner.py index 97d7c363e1..2e32cee138 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -30,17 +30,7 @@ def main_loop(args): try: trials = dict() - # init command channel - command_channel = None - if args.command_channel == "file": - command_channel = FileChannel(args) - else: - command_channel = WebChannel(args) - command_channel.open() - - nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel))) - args.command_channel = command_channel - + command_channel = args.command_channel # command loop while True: command_type, command_data = command_channel.receive() @@ -222,7 +212,7 @@ def run(self): from .trial import Trial from .file_channel import FileChannel from .web_channel import WebChannel - from .base_channel import CommandType + from .commands import CommandType is_multi_node = args.node_count > 1 @@ -240,8 +230,19 @@ def run(self): # node id is unique in the runner args.node_id = None + # init command channel + command_channel = None + if args.command_channel == "file": + command_channel = FileChannel(args) + else: + command_channel = WebChannel(args) + command_channel.open() + + nni_log(LogType.Info, "command channel is {}, actual type is {}".format(args.command_channel, type(command_channel))) + args.command_channel = command_channel + trial_runner_syslogger = RemoteLogger(args.nnimanager_ip, args.nnimanager_port, 'runner', - StdOutputType.Stdout, args.log_collection, args.runner_name) + StdOutputType.Stdout, args.log_collection, args.runner_name, command_channel) sys.stdout = sys.stderr = trial_runner_syslogger nni_log(LogType.Info, "{}: merged args is {}".format(args.node_id, args)) From 0f2367c9f46a99bf3209f195a24152f230ba40f4 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 21:24:29 +0800 Subject: [PATCH 53/98] support version check --- src/nni_manager/core/commands.ts | 3 +++ 
.../reusable/trialDispatcher.ts | 22 +++++++++++++++---- tools/nni_trial_tool/commands.py | 1 + tools/nni_trial_tool/trial_runner.py | 15 +++++++++---- 4 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/nni_manager/core/commands.ts b/src/nni_manager/core/commands.ts index 93f69cdb5e..ba1e9b3925 100644 --- a/src/nni_manager/core/commands.ts +++ b/src/nni_manager/core/commands.ts @@ -14,6 +14,7 @@ const PING = 'PI'; const GPU_INFO = 'GI'; const STDOUT = 'SO'; +const VERSION_CHECK = 'VC'; const INITIALIZED = 'ID'; const NEW_TRIAL_JOB = 'TR'; @@ -32,6 +33,7 @@ const TRIAL_COMMANDS: Set = new Set([ TRIAL_END, GPU_INFO, STDOUT, + VERSION_CHECK, ]); const TUNER_COMMANDS: Set = new Set([ @@ -71,6 +73,7 @@ export { PING, GPU_INFO, STDOUT, + VERSION_CHECK, INITIALIZED, NEW_TRIAL_JOB, NO_MORE_TRIAL_JOBS, diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 803388d92d..c9bb29bd62 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -13,7 +13,7 @@ import { getExperimentId, getPlatform, getBasePort } from '../../common/experime import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; import { delay, getExperimentRootDir, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils'; -import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END } from '../../core/commands'; +import { GPU_INFO, INITIALIZED, KILL_TRIAL_JOB, NEW_TRIAL_JOB, REPORT_METRIC_DATA, SEND_TRIAL_JOB_PARAMETER, STDOUT, TRIAL_END, VERSION_CHECK } from '../../core/commands'; import { GPUSummary } from '../../training_service/common/gpuData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from 
'../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; @@ -39,9 +39,10 @@ class TrialDispatcher implements TrainingService { private stopping: boolean = false; private readonly metricsEmitter: EventEmitter; - private versionCheck: boolean = true; private readonly experimentId: string; + private enableVersionCheck: boolean = true; + private trialConfig: TrialConfig | undefined; private runnerSettings: RunnerSettings; @@ -228,8 +229,8 @@ class TrialDispatcher implements TrainingService { this.runnerSettings.nniManagerIP = (JSON.parse(value)).nniManagerIp; break; case TrialConfigMetadataKey.VERSION_CHECK: - this.versionCheck = (value === 'true' || value === 'True'); - this.runnerSettings.nniManagerVersion = this.versionCheck ? await getVersion() : ''; + this.enableVersionCheck = (value === 'true' || value === 'True'); + this.runnerSettings.nniManagerVersion = this.enableVersionCheck ? await getVersion() : ''; break; case TrialConfigMetadataKey.LOG_COLLECTION: this.runnerSettings.logCollection = value; @@ -596,6 +597,19 @@ class TrialDispatcher implements TrainingService { } } break; + case VERSION_CHECK: + { + if (this.enableVersionCheck) { + const checkResultSuccess: boolean = data["tag"] === 'VCSuccess' ? 
true : false; + if (checkResultSuccess) { + this.log.info(`TrialDispatcher: Version check in trialKeeper success!`); + } else { + const errorMessage = `TrialDispatcher: Version check error, ${data["msg"]}!`; + this.log.error(errorMessage); + } + } + } + break; case GPU_INFO: environment.gpuSummary.set(nodeId, (data)); break; diff --git a/tools/nni_trial_tool/commands.py b/tools/nni_trial_tool/commands.py index 2ecb46c9c1..86b10a2fe9 100644 --- a/tools/nni_trial_tool/commands.py +++ b/tools/nni_trial_tool/commands.py @@ -22,3 +22,4 @@ class CommandType(Enum): NoMoreTrialJobs = b'NO' KillTrialJob = b'KI' StdOut = b'SO' + VersionCheck = b'VC' diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 2e32cee138..5a8fc39caa 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -110,6 +110,7 @@ def check_version(args): nni_log(LogType.Warning, 'Skipping version check!') else: try: + command_channel = args.command_channel trial_runner_version = regular.search(trial_runner_version).group('version') nni_log(LogType.Info, '{0}: runner_version is {1}'.format(args.node_id, trial_runner_version)) nni_manager_version = regular.search(args.nni_manager_version).group('version') @@ -121,14 +122,14 @@ def check_version(args): args.node_id, nni_manager_version, trial_runner_version) log_entry['tag'] = 'VCFail' log_entry['msg'] = error_message - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_name), json.dumps(log_entry), 10, - False) + command_channel.send(CommandType.VersionCheck, log_entry) + while not command_channel.sent(): + time.sleep(1) os._exit(1) else: nni_log(LogType.Info, '{0}: Version match!'.format(args.node_id)) log_entry['tag'] = 'VCSuccess' - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, args.runner_name), json.dumps(log_entry), 10, - False) + command_channel.send(CommandType.VersionCheck, log_entry) except AttributeError as 
err: nni_log(LogType.Error, '{0}: {1}'.format(args.node_id, err)) @@ -254,6 +255,12 @@ def run(self): main_loop(args) except SystemExit as se: nni_log(LogType.Info, '{}: NNI trial runner exit with code {}'.format(args.node_id, se.code)) + + # try best to send latest errors to server + timeout = 10 + while not command_channel.sent() and timeout > 0: + timeout -= 1 + time.sleep(1) os._exit(se.code) finally: if trial_runner_syslogger is not None: From 9d7bd3cc8bb77389ac167aaae5c88e2917755636 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 21:51:09 +0800 Subject: [PATCH 54/98] fix pylint errors --- tools/nni_trial_tool/trial_runner.py | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 5a8fc39caa..87db8fda2b 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -7,7 +7,6 @@ import random import re import sys -import threading import time import traceback from datetime import datetime, timedelta @@ -133,28 +132,6 @@ def check_version(args): except AttributeError as err: nni_log(LogType.Error, '{0}: {1}'.format(args.node_id, err)) - -def fetch_parameter_file(args): - class FetchThread(threading.Thread): - def __init__(self, args): - super(FetchThread, self).__init__() - self.args = args - - def run(self): - uri = gen_parameter_meta_url(self.args.nnimanager_ip, self.args.nnimanager_port) - nni_log(LogType.Info, uri) - - while True: - res = rest_get(uri, 10) - nni_log(LogType.Debug, 'status code: {}'.format(res.status_code)) - if res.status_code != 200: - nni_log(LogType.Warning, 'rest response: {}'.format(str(res))) - time.sleep(2) - - fetch_file_thread = FetchThread(args) - fetch_file_thread.start() - - if __name__ == '__main__': '''NNI Trial Runner main function''' @@ -208,8 +185,6 @@ def run(self): os.environ['NNI_TRIAL_JOB_ID'] = "runner" from .log_utils import LogType, RemoteLogger, StdOutputType, nni_log - 
from .rest_utils import rest_get, rest_post - from .url_utils import gen_parameter_meta_url, gen_send_version_url from .trial import Trial from .file_channel import FileChannel from .web_channel import WebChannel From 130ed27d9c5bd7fa1768284e35146cf185c7a83a Mon Sep 17 00:00:00 2001 From: Chi Song Date: Mon, 29 Jun 2020 22:23:07 +0800 Subject: [PATCH 55/98] fix non-local failed ITs --- test/config/integration_tests.yml | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/test/config/integration_tests.yml b/test/config/integration_tests.yml index 399435d13e..86351ef7ea 100644 --- a/test/config/integration_tests.yml +++ b/test/config/integration_tests.yml @@ -49,10 +49,17 @@ testCases: maxTrialNum: 2 trialConcurrency: 1 -- name: mnist-pytorch +- name: mnist-pytorch-local configFile: test/config/examples/mnist-pytorch.yml # download data first, to prevent concurrent issue. launchCommand: python3 ../examples/trials/mnist-pytorch/mnist.py --epochs 1 --batch_num 0 --data_dir ../examples/trials/mnist-pytorch/data && nnictl create --config $configFile --debug + trainingService: local + +- name: mnist-pytorch-non-local + configFile: test/config/examples/mnist-pytorch.yml + # download data first, to prevent concurrent issue. 
+ launchCommand: nnictl create --config $configFile --debug + trainingService: remote pai kubeflow frameworkcontroller dlts - name: mnist-annotation configFile: test/config/examples/mnist-annotation.yml From 7c48610ad0b8b98e3ef54d51365dd5a3e95cd973 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:04:33 +0800 Subject: [PATCH 56/98] format --- .../common/containerJobData.ts | 2 +- .../reusable/channels/amlCommandChannel.ts | 20 ++--- .../training_service/reusable/environment.ts | 87 ++++++++++--------- .../environments/amlEnvironmentService.ts | 1 - tools/nni_trial_tool/aml_channel.py | 1 - tools/nni_trial_tool/log_utils.py | 2 - tools/nni_trial_tool/trial.py | 4 - tools/nni_trial_tool/trial_keeper.py | 4 +- tools/nni_trial_tool/trial_runner.py | 3 +- tools/nni_trial_tool/url_utils.py | 12 +-- 10 files changed, 58 insertions(+), 78 deletions(-) diff --git a/src/nni_manager/training_service/common/containerJobData.ts b/src/nni_manager/training_service/common/containerJobData.ts index 99690f415c..f7a29f384a 100644 --- a/src/nni_manager/training_service/common/containerJobData.ts +++ b/src/nni_manager/training_service/common/containerJobData.ts @@ -10,5 +10,5 @@ if python3 -c 'import nni' > /dev/null 2>&1; then return else # Install nni - python3 -m pip install --user --no-cache-dir -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nni==1.63654 + python3 -m pip install --user --upgrade nni fi`; diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index 5536c64f55..fa83e1cbce 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -34,7 +34,6 @@ class AMLRunnerConnection extends RunnerConnection { export class AMLCommandChannel extends CommandChannel { private stopping: boolean = false; private currentMessageIndex: 
number = -1; - private currentMetricIndex: number = -1; // make sure no concurrent issue when sending commands. private sendQueues: [EnvironmentInformation, string][] = []; private metricEmitter: EventEmitter | undefined; @@ -50,7 +49,6 @@ export class AMLCommandChannel extends CommandChannel { public async config(_key: string, _value: any): Promise { switch (_key) { case "MetricEmitter": - console.log('------init metric emitter---------') this.metricEmitter = _value as EventEmitter; break; } @@ -90,8 +88,6 @@ export class AMLCommandChannel extends CommandChannel { if (!amlClient) { throw new Error('aml client not initialized!'); } - console.log('--------sending message-------') - console.log(message) amlClient.sendCommand(message); } } @@ -142,25 +138,21 @@ export class AMLCommandChannel extends CommandChannel { } private handleTrialMessage(environment: EnvironmentInformation, message: string) { - console.log('---------handling message-------') - console.log(message) const commands = this.parseCommands(message); if (commands.length > 0) { - console.log(commands) const commandType = commands[0][0]; if (commandType === STDOUT) { - this.handleTrialMetrics(message); + this.handleTrialMetrics(commands[0][1]); } else { - this.handleCommand(environment, commands[0][1]); + this.handleCommand(environment, message); } } } - private handleTrialMetrics(message: string): void { - let messageObj = JSON.parse(message); - let trialId = messageObj['trialId']; - let msg = messageObj['msg']; - let tag = messageObj['tag']; + private handleTrialMetrics(message: any): void { + let trialId = message['trialId']; + let msg = message['msg']; + let tag = message['tag']; if (tag === 'trial') { const metricsContent: any = msg.match(this.NNI_METRICS_PATTERN); if (metricsContent && metricsContent.groups) { diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 14db7a661d..f72d029f87 100644 --- 
a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -14,6 +14,48 @@ import { CommandChannel } from "./commandChannel"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; export type Channel = "web" | "file" | "aml" | "ut"; +export abstract class EnvironmentService { + + public abstract get hasStorageService(): boolean; + + public abstract config(key: string, value: string): Promise; + public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; + public abstract startEnvironment(environment: EnvironmentInformation): Promise; + public abstract stopEnvironment(environment: EnvironmentInformation): Promise; + + public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { + return new WebCommandChannel(commandEmitter); + } + + public createEnviornmentInfomation(envId: string, envName: string): EnvironmentInformation { + return new EnvironmentInformation(envId, envName); + } +} + +export class NodeInfomation { + public id: string; + public status: TrialJobStatus = "UNKNOWN"; + public endTime?: number; + + constructor(id: string) { + this.id = id; + } +} + +export class RunnerSettings { + public experimentId: string = ""; + public platform: string = ""; + public nniManagerIP: string = ""; + public nniManagerPort: number = 8081; + public nniManagerVersion: string = ""; + public logCollection: string = "none"; + public command: string = ""; + public enableGpuCollector: boolean = false; + + // specify which communication channel is used by runner. 
+ // supported channel includes: rest, storage, aml + public commandChannel: Channel = "file"; +} export class EnvironmentInformation { private log: Logger; @@ -38,6 +80,9 @@ export class EnvironmentInformation { public runnerWorkingFolder: string = ""; public command: string = ""; public nodeCount: number = 1; + // aml related resource, need to refactor + public environmentLocalTempFolder: string = ""; + public environmentClient: any = ""; // it's used to aggregate node status for multiple node trial public nodes: Map; @@ -65,45 +110,3 @@ export class EnvironmentInformation { } } } -export abstract class EnvironmentService { - - public abstract get hasStorageService(): boolean; - - public abstract config(key: string, value: string): Promise; - public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; - public abstract startEnvironment(environment: EnvironmentInformation): Promise; - public abstract stopEnvironment(environment: EnvironmentInformation): Promise; - - public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { - return new WebCommandChannel(commandEmitter); - } - - public createEnviornmentInfomation(envId: string, envName: string): EnvironmentInformation { - return new EnvironmentInformation(envId, envName); - } -} - -export class NodeInfomation { - public id: string; - public status: TrialJobStatus = "UNKNOWN"; - public endTime?: number; - - constructor(id: string) { - this.id = id; - } -} - -export class RunnerSettings { - public experimentId: string = ""; - public platform: string = ""; - public nniManagerIP: string = ""; - public nniManagerPort: number = 8081; - public nniManagerVersion: string = ""; - public logCollection: string = "none"; - public command: string = ""; - public enableGpuCollector: boolean = false; - - // specify which communication channel is used by runner. 
- // supported channel includes: rest, storage, aml - public commandChannel: Channel = "file"; -} diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 9d212059e2..f0b8df7f6e 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -128,7 +128,6 @@ export class AMLEnvironmentService extends EnvironmentService { break; case 'COMPLETED': environment.status = 'SUCCEEDED'; - break; case 'SUCCEEDED': environment.status = 'SUCCEEDED'; break; diff --git a/tools/nni_trial_tool/aml_channel.py b/tools/nni_trial_tool/aml_channel.py index c75f569e41..61786dc25a 100644 --- a/tools/nni_trial_tool/aml_channel.py +++ b/tools/nni_trial_tool/aml_channel.py @@ -31,7 +31,6 @@ def _inner_receive(self): messages = [] # receive message is string, to get consistent result, encode it here. 
message_dict = self.run.get_metrics() - print(message_dict) if 'nni_manager' not in message_dict: return [] message_list = message_dict['nni_manager'] diff --git a/tools/nni_trial_tool/log_utils.py b/tools/nni_trial_tool/log_utils.py index 90e27895c1..20848d9804 100644 --- a/tools/nni_trial_tool/log_utils.py +++ b/tools/nni_trial_tool/log_utils.py @@ -16,7 +16,6 @@ from queue import Queue -from .constants import NNI_PLATFORM from .rest_utils import rest_post from .url_utils import gen_send_stdout_url from .commands import CommandType @@ -59,7 +58,6 @@ def __init__(self, host, port, tag, trial_id, channel, std_output_type=StdOutput def emit(self, record): log_entry = {} log_entry['tag'] = self.tag - log_entry['trialId'] = self.trial_id log_entry['stdOutputType'] = self.std_output_type.name log_entry['msg'] = self.format(record) diff --git a/tools/nni_trial_tool/trial.py b/tools/nni_trial_tool/trial.py index 7517793b05..e2753b1518 100644 --- a/tools/nni_trial_tool/trial.py +++ b/tools/nni_trial_tool/trial.py @@ -3,10 +3,6 @@ import ctypes import os -<<<<<<< HEAD -import logging -======= ->>>>>>> 0b9d6ce6d1cf6515c5b553a61f78cd333bf4700f import shlex import tarfile import time diff --git a/tools/nni_trial_tool/trial_keeper.py b/tools/nni_trial_tool/trial_keeper.py index e5714b006b..08688973e0 100644 --- a/tools/nni_trial_tool/trial_keeper.py +++ b/tools/nni_trial_tool/trial_keeper.py @@ -151,13 +151,13 @@ def check_version(args): nni_manager_version, trial_keeper_version) log_entry['tag'] = 'VCFail' log_entry['msg'] = error_message - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, None), json.dumps(log_entry), 10, + rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, False) os._exit(1) else: nni_log(LogType.Info, 'Version match!') log_entry['tag'] = 'VCSuccess' - rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port, None), json.dumps(log_entry), 10, + 
rest_post(gen_send_version_url(args.nnimanager_ip, args.nnimanager_port), json.dumps(log_entry), 10, False) except AttributeError as err: nni_log(LogType.Error, err) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index cd62806865..5edd39c28c 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -9,7 +9,6 @@ import sys import time import traceback -import logging from datetime import datetime, timedelta import pkg_resources @@ -207,7 +206,7 @@ def check_version(args): else: # node id is unique in the runner args.node_id = None - + # init command channel command_channel = None if args.command_channel == "file": diff --git a/tools/nni_trial_tool/url_utils.py b/tools/nni_trial_tool/url_utils.py index 4a2d2b9c12..f6a720f221 100644 --- a/tools/nni_trial_tool/url_utils.py +++ b/tools/nni_trial_tool/url_utils.py @@ -6,20 +6,14 @@ def gen_send_stdout_url(ip, port): '''Generate send stdout url''' - if trial_id is None: - trial_id = NNI_TRIAL_JOB_ID - return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, STDOUT_API, NNI_EXP_ID, trial_id) - + return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, STDOUT_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID) def gen_send_version_url(ip, port): '''Generate send error url''' - if trial_id is None: - trial_id = NNI_TRIAL_JOB_ID - return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, VERSION_API, NNI_EXP_ID, trial_id) - + return '{0}:{1}{2}{3}/{4}/{5}'.format(BASE_URL.format(ip), port, API_ROOT_URL, VERSION_API, NNI_EXP_ID, NNI_TRIAL_JOB_ID) def gen_parameter_meta_url(ip, port): '''Generate send error url''' - return '{0}:{1}{2}{3}'.format(BASE_URL.format(ip), port, API_ROOT_URL, PARAMETER_META_API) + return '{0}:{1}{2}{3}'.format(BASE_URL.format(ip), port, API_ROOT_URL, PARAMETER_META_API) \ No newline at end of file From c0c7d964b33d07878ea20d0052d41666103acb0a Mon Sep 17 00:00:00 2001 From: 
Shinai Yang Date: Tue, 30 Jun 2020 14:09:14 +0800 Subject: [PATCH 57/98] format code --- src/nni_manager/common/restServer.ts | 4 ++-- .../reusable/aml/amlClient.ts | 20 ++----------------- .../reusable/channels/amlCommandChannel.ts | 20 ++----------------- .../environments/amlEnvironmentService.ts | 20 ++----------------- .../reusable/routerTrainingService.ts | 20 ++----------------- 5 files changed, 10 insertions(+), 74 deletions(-) diff --git a/src/nni_manager/common/restServer.ts b/src/nni_manager/common/restServer.ts index ed88e2d476..368aff977c 100644 --- a/src/nni_manager/common/restServer.ts +++ b/src/nni_manager/common/restServer.ts @@ -19,9 +19,9 @@ import { getBasePort } from './experimentStartupInfo'; export abstract class RestServer { private startTask!: Deferred; private stopTask!: Deferred; - + private server!: http.Server; + /** The fields can be inherited by subclass */ - protected server!: http.Server; protected hostName: string = '0.0.0.0'; protected port?: number; protected app: express.Application = express(); diff --git a/src/nni_manager/training_service/reusable/aml/amlClient.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts index e37391d491..095e5fa332 100644 --- a/src/nni_manager/training_service/reusable/aml/amlClient.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. /** +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index fa83e1cbce..de79727a7c 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. /** +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index f0b8df7f6e..c12f0e1e31 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. /** +// Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 5429a69381..9e7de3aa25 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -1,21 +1,5 @@ -/** - * Copyright (c) Microsoft Corporation - * All rights reserved. 
- * - * MIT License - * - * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated - * documentation files (the "Software"), to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and - * to permit persons to whom the Software is furnished to do so, subject to the following conditions: - * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED *AS IS*, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING - * BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - */ +// Copyright (c) Microsoft Corporation. /** +// Licensed under the MIT license. 'use strict'; From 93eefb231efa8218281db772835559e587222182 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:17:42 +0800 Subject: [PATCH 58/98] format code --- .../training_service/reusable/routerTrainingService.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 9e7de3aa25..20cd0aa535 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. /** +// Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 
'use strict'; From 53cea0f26bbd6e3f0d23321200b31f8e69ea9dd3 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:20:06 +0800 Subject: [PATCH 59/98] remove unused code --- .../reusable/aml/amlConfig.ts | 30 ------------------- .../environments/amlEnvironmentService.ts | 2 +- 2 files changed, 1 insertion(+), 31 deletions(-) diff --git a/src/nni_manager/training_service/reusable/aml/amlConfig.ts b/src/nni_manager/training_service/reusable/aml/amlConfig.ts index 9ec1c662fb..fda4d38426 100644 --- a/src/nni_manager/training_service/reusable/aml/amlConfig.ts +++ b/src/nni_manager/training_service/reusable/aml/amlConfig.ts @@ -37,36 +37,6 @@ export class AMLTrialConfig extends TrialConfig { } } -/** - * AML trial job detail - */ -export class AMLTrialJobDetail implements TrialJobDetail { - public id: string; - public status: TrialJobStatus; - public amlJobName: string; - public submitTime: number; - public startTime?: number; - public endTime?: number; - public tags?: string[]; - public url?: string; - public workingDirectory: string; - public form: TrialJobApplicationForm; - public logPath: string; - public isEarlyStopped?: boolean; - - constructor(id: string, status: TrialJobStatus, amlJobName: string, - submitTime: number, workingDirectory: string, form: TrialJobApplicationForm, logPath: string) { - this.id = id; - this.status = status; - this.amlJobName = amlJobName; - this.submitTime = submitTime; - this.workingDirectory = workingDirectory; - this.form = form; - this.tags = []; - this.logPath = logPath; - } -} - export class AMLEnvironmentInformation extends EnvironmentInformation { public amlClient?: AMLClient; } diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index c12f0e1e31..fd6a3b5524 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ 
b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -10,7 +10,7 @@ import * as component from '../../../common/component'; import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../../common/log'; import { TrialConfigMetadataKey } from '../../common/trialConfigMetadataKey'; -import { AMLClusterConfig, AMLTrialConfig, AMLTrialJobDetail } from '../aml/amlConfig'; +import { AMLClusterConfig, AMLTrialConfig } from '../aml/amlConfig'; import { EnvironmentInformation, EnvironmentService } from '../environment'; import { AMLEnvironmentInformation } from '../aml/amlConfig'; import { AMLClient } from '../aml/amlClient'; From 34d935139b1323f4177bf92254b5e81b459ebc0f Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:33:46 +0800 Subject: [PATCH 60/98] format code --- src/nni_manager/training_service/reusable/aml/amlClient.ts | 2 +- .../training_service/reusable/channels/amlCommandChannel.ts | 2 +- .../reusable/environments/amlEnvironmentService.ts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/reusable/aml/amlClient.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts index 095e5fa332..2e9c49bf74 100644 --- a/src/nni_manager/training_service/reusable/aml/amlClient.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. /** +// Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index de79727a7c..ee8a33b9ba 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. 
/** +// Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 'use strict'; diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index fd6a3b5524..e85d939bc5 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -1,4 +1,4 @@ -// Copyright (c) Microsoft Corporation. /** +// Copyright (c) Microsoft Corporation. // Licensed under the MIT license. 'use strict'; From 25a9dab67365478fc71c58e1e54838c7edc450d1 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:39:32 +0800 Subject: [PATCH 61/98] fix comments --- deployment/pypi/setup.py | 2 -- docs/en_US/TrainingService/AMLMode.md | 8 +++++++- setup.py | 2 -- tools/setup.py | 2 -- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/deployment/pypi/setup.py b/deployment/pypi/setup.py index 8985e90ead..61f7ff0178 100644 --- a/deployment/pypi/setup.py +++ b/deployment/pypi/setup.py @@ -50,8 +50,6 @@ package_data = {'nni': ['**/requirements.txt']}, python_requires = '>=3.5', install_requires = [ - 'azureml', - 'azureml-sdk', 'schema', 'ruamel.yaml', 'psutil', diff --git a/docs/en_US/TrainingService/AMLMode.md b/docs/en_US/TrainingService/AMLMode.md index f9ae38bcf5..dd9fcaf40a 100644 --- a/docs/en_US/TrainingService/AMLMode.md +++ b/docs/en_US/TrainingService/AMLMode.md @@ -5,11 +5,17 @@ NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/se ## Setup environment Step 1. Install NNI, follow the install guide [here](../Tutorial/QuickStart.md). -Step 2. Create AML account, follow the document [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli) +Step 2. 
Create AML account, follow the document [here](https://docs.microsoft.com/en-us/azure/machine-learning/how-to-manage-workspace-cli). Step 3. Get your account information. ![](../../img/aml_account.png) +Step4. Install AML package environment. +``` +python3 -m pip install azureml --user +python3 -m pip install azureml-sdk --user +``` + ## Run an experiment Use `examples/trials/mnist-tfv1` as an example. The NNI config YAML file's content is like: diff --git a/setup.py b/setup.py index c1da2ddf08..30d4f448c6 100644 --- a/setup.py +++ b/setup.py @@ -30,8 +30,6 @@ def read(fname): python_requires = '>=3.5', install_requires = [ 'astor', - 'azureml', - 'azureml-sdk', 'hyperopt==0.1.2', 'json_tricks', 'netifaces', diff --git a/tools/setup.py b/tools/setup.py index 7e1330f2a7..48d6923dca 100644 --- a/tools/setup.py +++ b/tools/setup.py @@ -10,8 +10,6 @@ python_requires = '>=3.5', install_requires = [ - 'azureml', - 'azureml-sdk', 'requests', 'ruamel.yaml', 'psutil', From cada76ad162f2fe49f05b5c486eaa9247be2437e Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:40:18 +0800 Subject: [PATCH 62/98] fix comments --- docs/en_US/TrainingService/AMLMode.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en_US/TrainingService/AMLMode.md b/docs/en_US/TrainingService/AMLMode.md index dd9fcaf40a..ab12e2d05a 100644 --- a/docs/en_US/TrainingService/AMLMode.md +++ b/docs/en_US/TrainingService/AMLMode.md @@ -1,6 +1,6 @@ **Run an Experiment on Azure Machine Learning** === -NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/services/machine-learning/) , called aml mode. Before starting to use NNI pai mode, you should have an account to access an [OpenPAI](https://github.com/Microsoft/pai) cluster. See [here](https://github.com/Microsoft/pai#how-to-deploy) if you don't have any OpenPAI account and want to deploy an OpenPAI cluster. In pai mode, your trial program will run in pai's container created by Docker. 
+NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/services/machine-learning/) , called aml mode. ## Setup environment Step 1. Install NNI, follow the install guide [here](../Tutorial/QuickStart.md). From de7dc7c71d4172e8af9a2f38a44a23137aef1b95 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:47:19 +0800 Subject: [PATCH 63/98] fix comments --- docs/en_US/TrainingService/AMLMode.md | 6 +++--- examples/trials/mnist-tfv1/config_aml.yml | 2 +- src/nni_manager/config/aml/amlUtil.py | 5 ++++- src/nni_manager/rest_server/restValidationSchemas.ts | 2 +- .../training_service/reusable/aml/amlClient.ts | 8 ++++---- .../training_service/reusable/aml/amlConfig.ts | 6 +++--- .../reusable/environments/amlEnvironmentService.ts | 2 +- tools/nni_cmd/config_schema.py | 2 +- 8 files changed, 18 insertions(+), 15 deletions(-) diff --git a/docs/en_US/TrainingService/AMLMode.md b/docs/en_US/TrainingService/AMLMode.md index ab12e2d05a..30c748adfb 100644 --- a/docs/en_US/TrainingService/AMLMode.md +++ b/docs/en_US/TrainingService/AMLMode.md @@ -39,7 +39,7 @@ tuner: trial: command: python3 mnist.py codeDir: . - computerTarget: ussc40rscl + computeTarget: ussc40rscl nodeCount: 1 amlConfig: subscriptionId: ${replace_to_your_subscriptionId} @@ -51,10 +51,10 @@ amlConfig: Note: You should set `trainingServicePlatform: aml` in NNI config YAML file if you want to start experiment in aml mode. Compared with [LocalMode](LocalMode.md) trial configuration in aml mode have these additional keys: -* computerTarget +* computeTarget * required key. The computer cluster name you want to use in your AML workspace. * nodeCount - * required key. The node count each run in your experiment. + * required key. The number of nodes to use for one run. 
[refer](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py#variables) amlConfig: * subscriptionId diff --git a/examples/trials/mnist-tfv1/config_aml.yml b/examples/trials/mnist-tfv1/config_aml.yml index 98af856430..1f1f471e77 100644 --- a/examples/trials/mnist-tfv1/config_aml.yml +++ b/examples/trials/mnist-tfv1/config_aml.yml @@ -17,7 +17,7 @@ tuner: trial: command: python3 mnist.py codeDir: . - computerTarget: ussc40rscl + computeTarget: ussc40rscl nodeCount: 1 amlConfig: subscriptionId: ${replace_to_your_subscriptionId} diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index ddf3303522..50380d0f7e 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT license. + import os import sys import time @@ -51,4 +54,4 @@ elif line: items = line.split(':') if items[0] == 'command': - run.log('nni_manager', line[8:]) \ No newline at end of file + run.log('nni_manager', line[8:]) diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 8704495159..7cee7611de 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -39,7 +39,7 @@ export namespace ValidationSchemas { nniManagerNFSMountPath: joi.string().min(1), containerNFSMountPath: joi.string().min(1), paiConfigPath: joi.string(), - computerTarget: joi.string(), + computeTarget: joi.string(), nodeCount: joi.number(), paiStorageConfigName: joi.string().min(1), nasMode: joi.string().valid('classic_mode', 'enas_mode', 'oneshot_mode', 'darts_mode'), diff --git a/src/nni_manager/training_service/reusable/aml/amlClient.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts index 2e9c49bf74..557743a175 100644 --- 
a/src/nni_manager/training_service/reusable/aml/amlClient.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -19,14 +19,14 @@ export class AMLClient { public pythonShellClient: undefined | PythonShell; public codeDir: string; public nodeCount: number; - public computerTarget: string; + public computeTarget: string; constructor( subscriptionId: string, resourceGroup: string, workspaceName: string, experimentId: string, - computerTarget: string, + computeTarget: string, nodeCount: number, image: string, scriptName: string, @@ -40,7 +40,7 @@ export class AMLClient { this.nodeCount = nodeCount; this.scriptName = scriptName; this.codeDir = codeDir; - this.computerTarget = computerTarget; + this.computeTarget = computeTarget; } public async submit(): Promise { @@ -52,7 +52,7 @@ export class AMLClient { '--subscription_id', this.subscriptionId, '--resource_group', this.resourceGroup, '--workspace_name', this.workspaceName, - '--computer_target', this.computerTarget, + '--computer_target', this.computeTarget, '--docker_image', this.image, '--experiment_name', `nni_exp_${this.experimentId}`, '--script_dir', this.codeDir, diff --git a/src/nni_manager/training_service/reusable/aml/amlConfig.ts b/src/nni_manager/training_service/reusable/aml/amlConfig.ts index fda4d38426..de0f3ce2ca 100644 --- a/src/nni_manager/training_service/reusable/aml/amlConfig.ts +++ b/src/nni_manager/training_service/reusable/aml/amlConfig.ts @@ -25,15 +25,15 @@ export class AMLTrialConfig extends TrialConfig { public readonly command: string; public readonly codeDir: string; public readonly nodeCount: number; - public readonly computerTarget: string; + public readonly computeTarget: string; - constructor(codeDir: string, command: string, image: string, nodeCount: number, computerTarget: string) { + constructor(codeDir: string, command: string, image: string, nodeCount: number, computeTarget: string) { super("", codeDir, 0); this.codeDir = codeDir; this.command = command; this.image 
= image; this.nodeCount = nodeCount; - this.computerTarget = computerTarget; + this.computeTarget = computeTarget; } } diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index e85d939bc5..464f9eef84 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -146,7 +146,7 @@ export class AMLEnvironmentService extends EnvironmentService { this.amlClusterConfig.resourceGroup, this.amlClusterConfig.workspaceName, this.experimentId, - this.amlTrialConfig.computerTarget, + this.amlTrialConfig.computeTarget, this.amlTrialConfig.nodeCount, this.amlTrialConfig.image, 'nni_script.py', diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index a25d4e0cf4..cc96682dbc 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -239,7 +239,7 @@ def validate(self, data): 'codeDir': setPathCheck('codeDir'), 'command': setType('command', str), 'image': setType('image', str), - 'computerTarget': setType('computerTarget', str), + 'computeTarget': setType('computeTarget', str), 'nodeCount': setType('nodeCount', int) } } From 428dc3d5d9c3ad332bf5b5122d7801c2d72331d0 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 14:49:10 +0800 Subject: [PATCH 64/98] add blank line --- src/nni_manager/training_service/reusable/aml/amlClient.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/aml/amlClient.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts index 557743a175..8580505cec 100644 --- a/src/nni_manager/training_service/reusable/aml/amlClient.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -129,4 +129,4 @@ export class AMLClient { }); return deferred.promise; } -} \ No newline at end of file 
+} From 2e9c70e97e2ba833a9a793dc323ae235427e8e79 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 15:32:24 +0800 Subject: [PATCH 65/98] fix comments --- src/nni_manager/config/aml/amlUtil.py | 4 ++-- src/nni_manager/training_service/reusable/aml/amlClient.ts | 6 +++--- .../training_service/reusable/channels/amlCommandChannel.ts | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index 50380d0f7e..16099dbc74 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -17,7 +17,7 @@ parser.add_argument('--subscription_id', help='the subscription id of aml') parser.add_argument('--resource_group', help='the resource group of aml') parser.add_argument('--workspace_name', help='the workspace name of aml') - parser.add_argument('--computer_target', help='the computer cluster name of aml') + parser.add_argument('--compute_target', help='the compute cluster name of aml') parser.add_argument('--docker_image', help='the docker image of job') parser.add_argument('--experiment_name', help='the experiment name') parser.add_argument('--script_dir', help='script directory') @@ -26,7 +26,7 @@ args = parser.parse_args() ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) - compute_target = ComputeTarget(workspace=ws, name=args.computer_target) + compute_target = ComputeTarget(workspace=ws, name=args.compute_target) experiment = Experiment(ws, args.experiment_name) run_config = RunConfiguration() dependencies = CondaDependencies() diff --git a/src/nni_manager/training_service/reusable/aml/amlClient.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts index 8580505cec..3786f434f8 100644 --- a/src/nni_manager/training_service/reusable/aml/amlClient.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -43,7 +43,7 @@ export class AMLClient { this.computeTarget = computeTarget; } - 
public async submit(): Promise { + public submit(): Promise { const deferred: Deferred = new Deferred(); this.pythonShellClient = new PythonShell('amlUtil.py', { scriptPath: './config/aml', @@ -52,7 +52,7 @@ export class AMLClient { '--subscription_id', this.subscriptionId, '--resource_group', this.resourceGroup, '--workspace_name', this.workspaceName, - '--computer_target', this.computeTarget, + '--compute_target', this.computeTarget, '--docker_image', this.image, '--experiment_name', `nni_exp_${this.experimentId}`, '--script_dir', this.codeDir, @@ -67,7 +67,7 @@ export class AMLClient { return deferred.promise; } - public stop() { + public stop(): void { if (this.pythonShellClient === undefined) { throw Error('python shell client not initialized!'); } diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index ee8a33b9ba..0d30b38719 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -18,7 +18,6 @@ class AMLRunnerConnection extends RunnerConnection { export class AMLCommandChannel extends CommandChannel { private stopping: boolean = false; private currentMessageIndex: number = -1; - // make sure no concurrent issue when sending commands. 
private sendQueues: [EnvironmentInformation, string][] = []; private metricEmitter: EventEmitter | undefined; private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?.*?)'`; @@ -40,8 +39,9 @@ export class AMLCommandChannel extends CommandChannel { public async start(): Promise { // start command loops - this.receiveLoop(); - this.sendLoop(); + await Promise.all([ + this.receiveLoop(), + this.sendLoop()]); } public async stop(): Promise { From 8cf8583b814c65877d55e404e22750ffa193eb52 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 15:34:44 +0800 Subject: [PATCH 66/98] fix comments --- .../training_service/reusable/channels/amlCommandChannel.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index 0d30b38719..81ef7ee585 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -121,7 +121,7 @@ export class AMLCommandChannel extends CommandChannel { } } - private handleTrialMessage(environment: EnvironmentInformation, message: string) { + private handleTrialMessage(environment: EnvironmentInformation, message: string): void { const commands = this.parseCommands(message); if (commands.length > 0) { const commandType = commands[0][0]; From fd5fd9e46e1353e7f466cb99bb775aa4b40d4bec Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 15:37:21 +0800 Subject: [PATCH 67/98] fix build --- .../training_service/reusable/routerTrainingService.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/routerTrainingService.ts b/src/nni_manager/training_service/reusable/routerTrainingService.ts index 20cd0aa535..1fd28604be 100644 --- a/src/nni_manager/training_service/reusable/routerTrainingService.ts +++ 
b/src/nni_manager/training_service/reusable/routerTrainingService.ts @@ -10,7 +10,6 @@ import { TrainingService, TrialJobApplicationForm, TrialJobDetail, TrialJobMetri import { delay } from '../../common/utils'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; import { PAIClusterConfig } from '../pai/paiConfig'; -import { AMLClusterConfig } from '../aml/amlConfig'; import { PAIK8STrainingService } from '../pai/paiK8S/paiK8STrainingService'; import { EnvironmentService } from './environment'; import { OpenPaiEnvironmentService } from './environments/openPaiEnvironmentService'; From 54a22af64d8cc2b8342912d6aae70d1d7e83d4ff Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 15:39:33 +0800 Subject: [PATCH 68/98] fix comments --- .../reusable/environments/amlEnvironmentService.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 464f9eef84..7ce3ae85d7 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -98,7 +98,6 @@ export class AMLEnvironmentService extends EnvironmentService { } public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { - const deferred: Deferred = new Deferred(); environments.forEach(async (environment) => { let amlClient = (environment as AMLEnvironmentInformation).amlClient; if (!amlClient) { @@ -126,8 +125,6 @@ export class AMLEnvironmentService extends EnvironmentService { environment.status = 'UNKNOWN'; } }); - deferred.resolve(); - return deferred.promise; } public async startEnvironment(environment: EnvironmentInformation): Promise { From 525b961218cc1c28ea84fd190452704bdf1f2fdd Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 30 Jun 2020 15:40:31 +0800 Subject: [PATCH 69/98] 
fix channel async calls --- .../training_service/reusable/trialDispatcher.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index c9bb29bd62..039e9c2979 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -165,7 +165,7 @@ class TrialDispatcher implements TrainingService { this.runnerSettings.commandChannel = this.commandChannel.channelName; // for AML channel, other channels can ignore this. - this.commandChannel.config("MetricEmitter", this.metricsEmitter); + await this.commandChannel.config("MetricEmitter", this.metricsEmitter); // start channel this.commandEmitter.on("command", (command: Command): void => { @@ -173,7 +173,7 @@ class TrialDispatcher implements TrainingService { this.log.error(`TrialDispatcher: error on handle env ${command.environment.id} command: ${command.command}, data: ${command.data}, error: ${err}`); }) }); - this.commandChannel.start(); + await this.commandChannel.start(); this.log.info(`TrialDispatcher: started channel: ${this.commandChannel.constructor.name}`); if (this.trialConfig === undefined) { @@ -274,7 +274,7 @@ class TrialDispatcher implements TrainingService { } this.commandEmitter.off("command", this.handleCommand); - this.commandChannel.stop(); + await this.commandChannel.stop(); } private async environmentMaintenanceLoop(): Promise { From 8ec5e7daf3acbc834df1340a64c5efb09d0bd097 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 16:03:59 +0800 Subject: [PATCH 70/98] fix comments --- examples/trials/mnist-pytorch/config_aml.yml | 25 +++++++++++++ .../reusable/channels/amlCommandChannel.ts | 36 ++----------------- .../reusable/trialDispatcher.ts | 7 ++-- 3 files changed, 29 insertions(+), 39 deletions(-) create mode 100644 examples/trials/mnist-pytorch/config_aml.yml diff --git 
a/examples/trials/mnist-pytorch/config_aml.yml b/examples/trials/mnist-pytorch/config_aml.yml new file mode 100644 index 0000000000..be0c31c93c --- /dev/null +++ b/examples/trials/mnist-pytorch/config_aml.yml @@ -0,0 +1,25 @@ +authorName: default +experimentName: example_mnist_pytorch +trialConcurrency: 1 +maxExecDuration: 1h +maxTrialNum: 10 +trainingServicePlatform: aml +searchSpacePath: search_space.json +#choice: true, false +useAnnotation: false +tuner: + #choice: TPE, Random, Anneal, Evolution, BatchTuner, MetisTuner, GPTuner + #SMAC (SMAC should be installed through nnictl) + builtinTunerName: TPE + classArgs: + #choice: maximize, minimize + optimize_mode: maximize +trial: + command: python3 mnist.py + codeDir: . + computeTarget: ussc40rscl + nodeCount: 1 +amlConfig: + subscriptionId: ${replace_to_your_subscriptionId} + resourceGroup: ${replace_to_your_resourceGroup} + workspaceName: ${replace_to_your_workspaceName} diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index 81ef7ee585..ea87a81cd4 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -102,11 +102,11 @@ export class AMLCommandChannel extends CommandChannel { if (messages) { if (messages instanceof Object && this.currentMessageIndex < messages.length - 1) { for (let index = this.currentMessageIndex + 1; index < messages.length; index ++) { - this.handleTrialMessage(runnerConnection.environment, messages[index].toString()); + this.handleCommand(runnerConnection.environment, messages[index]); } this.currentMessageIndex = messages.length - 1; } else if (this.currentMessageIndex === -1){ - this.handleTrialMessage(runnerConnection.environment, messages.toString()); + this.handleCommand(runnerConnection.environment, messages); this.currentMessageIndex += 1; } } @@ -120,36 +120,4 @@ 
export class AMLCommandChannel extends CommandChannel { } } } - - private handleTrialMessage(environment: EnvironmentInformation, message: string): void { - const commands = this.parseCommands(message); - if (commands.length > 0) { - const commandType = commands[0][0]; - if (commandType === STDOUT) { - this.handleTrialMetrics(commands[0][1]); - } else { - this.handleCommand(environment, message); - } - } - } - - private handleTrialMetrics(message: any): void { - let trialId = message['trialId']; - let msg = message['msg']; - let tag = message['tag']; - if (tag === 'trial') { - const metricsContent: any = msg.match(this.NNI_METRICS_PATTERN); - if (metricsContent && metricsContent.groups) { - const key: string = 'metrics'; - const metric = metricsContent.groups[key]; - if (!this.metricEmitter) { - throw Error('metricEmitter not initialized'); - } - this.metricEmitter.emit('metric', { - id: trialId, - data: metric - }); - } - } - } } diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 105f1e4716..0ec04f3bdd 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -341,6 +341,7 @@ class TrialDispatcher implements TrainingService { toRefreshedTrials.push(trial); } } + if (toRefreshedTrials.length == 0) { continue; } @@ -447,10 +448,6 @@ class TrialDispatcher implements TrainingService { const envName = `nni_exp_${this.experimentId}_env_${envId}`; const environment = environmentService.createEnviornmentInfomation(envId, envName); - if (this.trialConfig === undefined) { - throw new Error(`trial config shouldn't be undefined in run()`); - } - environment.command = `sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; if (this.isDeveloping) { @@ -462,7 +459,7 @@ class TrialDispatcher implements TrainingService { environment.workingFolder = storageService.joinPath("envs", envId); await 
storageService.createDirectory(environment.workingFolder); } else { - environment.command = `cd envs && sh install_nni.sh && mkdir ${envId} && cd ${envId} && python3 -m nni_trial_tool.trial_runner`; + environment.command = `mkdir envs/${envId} && cd envs/${envId} && sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; } await environmentService.startEnvironment(environment); From bdd3840833636e46c7379364be29acc436d558c8 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 16:04:55 +0800 Subject: [PATCH 71/98] fix comments --- .../training_service/reusable/channels/amlCommandChannel.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index ea87a81cd4..b8f87122f5 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -39,9 +39,8 @@ export class AMLCommandChannel extends CommandChannel { public async start(): Promise { // start command loops - await Promise.all([ - this.receiveLoop(), - this.sendLoop()]); + this.receiveLoop(); + this.sendLoop(); } public async stop(): Promise { From b341dce5bb4aa64d60aa9d0c491fbd19f4ffb362 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 16:10:49 +0800 Subject: [PATCH 72/98] fix comments --- src/nni_manager/training_service/reusable/trialDispatcher.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 0ec04f3bdd..c8aa5008f6 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -459,7 +459,7 @@ class TrialDispatcher implements TrainingService { environment.workingFolder = 
storageService.joinPath("envs", envId); await storageService.createDirectory(environment.workingFolder); } else { - environment.command = `mkdir envs/${envId} && cd envs/${envId} && sh ../install_nni.sh && python3 -m nni_trial_tool.trial_runner`; + environment.command = `mkdir envs/${envId} && cd envs/${envId} && ${environment.command}`; } await environmentService.startEnvironment(environment); From e66dc23b064ae642f051c48e5b40279c68b512df Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 16:33:48 +0800 Subject: [PATCH 73/98] fix comments --- tools/nni_trial_tool/url_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nni_trial_tool/url_utils.py b/tools/nni_trial_tool/url_utils.py index f6a720f221..7942c62fb5 100644 --- a/tools/nni_trial_tool/url_utils.py +++ b/tools/nni_trial_tool/url_utils.py @@ -16,4 +16,4 @@ def gen_send_version_url(ip, port): def gen_parameter_meta_url(ip, port): '''Generate send error url''' - return '{0}:{1}{2}{3}'.format(BASE_URL.format(ip), port, API_ROOT_URL, PARAMETER_META_API) \ No newline at end of file + return '{0}:{1}{2}{3}'.format(BASE_URL.format(ip), port, API_ROOT_URL, PARAMETER_META_API) From 9cf6744e6f9d59158f2884d73ca8f726ba07da37 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 30 Jun 2020 16:45:19 +0800 Subject: [PATCH 74/98] merge code logic --- .../environments/openPaiEnvironmentService.ts | 5 +- .../reusable/trialDispatcher.ts | 67 +++++++++---------- 2 files changed, 33 insertions(+), 39 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts index ba8a3f4473..0d935a6a37 100644 --- a/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/openPaiEnvironmentService.ts @@ -167,8 +167,9 @@ export class OpenPaiEnvironmentService extends 
EnvironmentService { } // Step 1. Prepare PAI job configuration - environment.runnerWorkingFolder = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}/envs/${environment.id}`; - environment.command = `cd ${environment.runnerWorkingFolder} && ${environment.command}` + const environmentRoot = `${this.paiTrialConfig.containerNFSMountPath}/${this.experimentId}`; + environment.runnerWorkingFolder = `${environmentRoot}/envs/${environment.id}`; + environment.command = `cd ${environmentRoot} && ${environment.command}` environment.trackingUrl = `${this.protocol}://${this.paiClusterConfig.host}/job-detail.html?username=${this.paiClusterConfig.userName}&jobName=${environment.jobId}` // Step 2. Generate Job Configuration in yaml format diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 3adeff9c9f..407b020b4a 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -25,6 +25,7 @@ import { Command, CommandChannel } from './commandChannel'; import { EnvironmentInformation, EnvironmentService, NodeInfomation, RunnerSettings } from './environment'; import { StorageService } from './storageService'; import { TrialDetail } from './trial'; +import { MountedStorageService } from './storages/mountedStorageService'; /** @@ -184,38 +185,36 @@ class TrialDispatcher implements TrainingService { throw new Error(`trial config shouldn't be undefined in run()`); } + this.log.info(`TrialDispatcher: copying code and settings.`); + let storageService: StorageService; if (environmentService.hasStorageService) { - this.log.info(`TrialDispatcher: copying code and settings.`); - const storageService = component.get(StorageService); - // Copy the compressed file to remoteDirectory and delete it - const codeDir = path.resolve(this.trialConfig.codeDir); - const envDir = storageService.joinPath("envs"); - const 
codeFileName = await storageService.copyDirectory(codeDir, envDir, true); - storageService.rename(codeFileName, "nni-code.tar.gz"); - - const installFileName = storageService.joinPath(envDir, 'install_nni.sh'); - await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName); - - const runnerSettings = storageService.joinPath(envDir, "settings.json"); - await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); - - if (this.isDeveloping) { - let trialToolsPath = path.join(__dirname, "../../../../../tools/nni_trial_tool"); - if (false === fs.existsSync(trialToolsPath)) { - trialToolsPath = path.join(__dirname, "..\\..\\..\\..\\..\\tools\\nni_trial_tool"); - } - await storageService.copyDirectory(trialToolsPath, envDir, true); - } + this.log.debug(`TrialDispatcher: use existing storage service.`); + storageService = component.get(StorageService); } else { - //write configuration to local folder, for AML - let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp", "envs"); - await execMkdir(environmentLocalTempFolder); - const runnerSettingsPath = path.join(environmentLocalTempFolder, "settings.json"); - this.runnerSettings.command = this.trialConfig.command; - await fs.promises.writeFile(runnerSettingsPath, JSON.stringify(this.runnerSettings), { encoding: 'utf8' }); - const installFilePath = path.join(environmentLocalTempFolder, "install_nni.sh"); - await fs.promises.writeFile(installFilePath, CONTAINER_INSTALL_NNI_SHELL_FORMAT, { encoding: 'utf8' }); - await tarAdd(path.join(environmentLocalTempFolder, 'nni-code.tar.gz'), this.trialConfig.codeDir); + this.log.debug(`TrialDispatcher: create temp storage service to temp folder.`); + storageService = new MountedStorageService(); + let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); + storageService.initialize(this.trialConfig.codeDir, environmentLocalTempFolder); + } + + // Copy the 
compressed file to remoteDirectory and delete it + const codeDir = path.resolve(this.trialConfig.codeDir); + const envDir = storageService.joinPath("envs"); + const codeFileName = await storageService.copyDirectory(codeDir, envDir, true); + storageService.rename(codeFileName, "nni-code.tar.gz"); + + const installFileName = storageService.joinPath(envDir, 'install_nni.sh'); + await storageService.save(CONTAINER_INSTALL_NNI_SHELL_FORMAT, installFileName); + + const runnerSettings = storageService.joinPath(envDir, "settings.json"); + await storageService.save(JSON.stringify(this.runnerSettings), runnerSettings); + + if (this.isDeveloping) { + let trialToolsPath = path.join(__dirname, "../../../../../tools/nni_trial_tool"); + if (false === fs.existsSync(trialToolsPath)) { + trialToolsPath = path.join(__dirname, "..\\..\\..\\..\\..\\tools\\nni_trial_tool"); + } + await storageService.copyDirectory(trialToolsPath, envDir, true); } this.log.info(`TrialDispatcher: run loop started.`); @@ -454,13 +453,7 @@ class TrialDispatcher implements TrainingService { environment.command = "[ -d \"nni_trial_tool\" ] && echo \"nni_trial_tool exists already\" || (mkdir ./nni_trial_tool && tar -xof ../nni_trial_tool.tar.gz -C ./nni_trial_tool) && pip3 install websockets && " + environment.command; } - if (environmentService.hasStorageService) { - const storageService = component.get(StorageService); - environment.workingFolder = storageService.joinPath("envs", envId); - await storageService.createDirectory(environment.workingFolder); - } else { - environment.command = `mkdir envs/${envId} && cd envs/${envId} && ${environment.command}`; - } + environment.command = `mkdir -p envs/${envId} && cd envs/${envId} && ${environment.command}`; await environmentService.startEnvironment(environment); this.environments.set(environment.id, environment); From 51befa571a9a2b009058690ef65924ea4be156e5 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 30 Jun 2020 17:18:50 +0800 Subject: [PATCH 75/98] fix 
eslint errors --- .../training_service/reusable/environment.ts | 36 +++++++++---------- .../reusable/trialDispatcher.ts | 10 +++--- 2 files changed, 22 insertions(+), 24 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index f72d029f87..98bcd43323 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -14,24 +14,6 @@ import { CommandChannel } from "./commandChannel"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; export type Channel = "web" | "file" | "aml" | "ut"; -export abstract class EnvironmentService { - - public abstract get hasStorageService(): boolean; - - public abstract config(key: string, value: string): Promise; - public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; - public abstract startEnvironment(environment: EnvironmentInformation): Promise; - public abstract stopEnvironment(environment: EnvironmentInformation): Promise; - - public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { - return new WebCommandChannel(commandEmitter); - } - - public createEnviornmentInfomation(envId: string, envName: string): EnvironmentInformation { - return new EnvironmentInformation(envId, envName); - } -} - export class NodeInfomation { public id: string; public status: TrialJobStatus = "UNKNOWN"; @@ -110,3 +92,21 @@ export class EnvironmentInformation { } } } + +export abstract class EnvironmentService { + + public abstract get hasStorageService(): boolean; + + public abstract config(key: string, value: string): Promise; + public abstract refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise; + public abstract startEnvironment(environment: EnvironmentInformation): Promise; + public abstract stopEnvironment(environment: EnvironmentInformation): Promise; + + 
public getCommandChannel(commandEmitter: EventEmitter): CommandChannel { + return new WebCommandChannel(commandEmitter); + } + + public createEnviornmentInfomation(envId: string, envName: string): EnvironmentInformation { + return new EnvironmentInformation(envId, envName); + } +} diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 407b020b4a..7743fced01 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -9,7 +9,7 @@ import * as path from 'path'; import { Writable } from 'stream'; import { String } from 'typescript-string-operations'; import * as component from '../../common/component'; -import { getExperimentId, getPlatform, getBasePort } from '../../common/experimentStartupInfo'; +import { getBasePort, getExperimentId, getPlatform } from '../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../common/log'; import { NNIManagerIpConfig, TrainingService, TrialJobApplicationForm, TrialJobMetric, TrialJobStatus } from '../../common/trainingService'; import { delay, getExperimentRootDir, getLogLevel, getVersion, mkDirPSync, uniqueString } from '../../common/utils'; @@ -18,14 +18,12 @@ import { GPUSummary } from '../../training_service/common/gpuData'; import { CONTAINER_INSTALL_NNI_SHELL_FORMAT } from '../common/containerJobData'; import { TrialConfig } from '../common/trialConfig'; import { TrialConfigMetadataKey } from '../common/trialConfigMetadataKey'; -import { validateCodeDir, execMkdir, execCopydir, tarAdd } from '../common/util'; -import { WebCommandChannel } from './channels/webCommandChannel'; -import { AMLCommandChannel } from './channels/amlCommandChannel'; +import { validateCodeDir } from '../common/util'; import { Command, CommandChannel } from './commandChannel'; import { EnvironmentInformation, EnvironmentService, NodeInfomation, RunnerSettings } from 
'./environment'; +import { MountedStorageService } from './storages/mountedStorageService'; import { StorageService } from './storageService'; import { TrialDetail } from './trial'; -import { MountedStorageService } from './storages/mountedStorageService'; /** @@ -193,7 +191,7 @@ class TrialDispatcher implements TrainingService { } else { this.log.debug(`TrialDispatcher: create temp storage service to temp folder.`); storageService = new MountedStorageService(); - let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); + const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); storageService.initialize(this.trialConfig.codeDir, environmentLocalTempFolder); } From 478629f291217e62a3a955bc3711d0a0e85b07f7 Mon Sep 17 00:00:00 2001 From: Chi Song Date: Tue, 30 Jun 2020 17:48:50 +0800 Subject: [PATCH 76/98] add run fo messages --- .../reusable/channels/amlCommandChannel.ts | 28 +++++++++---------- .../reusable/channels/fileCommandChannel.ts | 14 +++++++--- .../reusable/channels/webCommandChannel.ts | 4 +++ .../reusable/commandChannel.ts | 3 ++ .../reusable/trialDispatcher.ts | 1 + 5 files changed, 31 insertions(+), 19 deletions(-) diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index b8f87122f5..2da929cb20 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -3,14 +3,11 @@ 'use strict'; -import * as component from "../../../common/component"; +import { EventEmitter } from 'events'; import { delay } from "../../../common/utils"; -import { CommandChannel, RunnerConnection } from "../commandChannel"; -import { EnvironmentInformation, Channel } from "../environment"; import { AMLEnvironmentInformation } from '../aml/amlConfig'; -import { EventEmitter } 
from 'events'; -import { AMLEnvironmentService } from "../environments/amlEnvironmentService"; -import { STDOUT } from "../../../core/commands"; +import { CommandChannel, RunnerConnection } from "../commandChannel"; +import { Channel, EnvironmentInformation } from "../environment"; class AMLRunnerConnection extends RunnerConnection { } @@ -19,7 +16,6 @@ export class AMLCommandChannel extends CommandChannel { private stopping: boolean = false; private currentMessageIndex: number = -1; private sendQueues: [EnvironmentInformation, string][] = []; - private metricEmitter: EventEmitter | undefined; private readonly NNI_METRICS_PATTERN: string = `NNISDK_MEb'(?.*?)'`; public constructor(commandEmitter: EventEmitter) { @@ -30,23 +26,25 @@ export class AMLCommandChannel extends CommandChannel { } public async config(_key: string, _value: any): Promise { - switch (_key) { - case "MetricEmitter": - this.metricEmitter = _value as EventEmitter; - break; - } + // do nothing } public async start(): Promise { - // start command loops - this.receiveLoop(); - this.sendLoop(); + // do nothing } public async stop(): Promise { this.stopping = true; } + public async run(): Promise { + // start command loops + await Promise.all([ + this.receiveLoop(), + this.sendLoop() + ]); + } + protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise { this.sendQueues.push([environment, message]); } diff --git a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts index 3c5149603a..02c1a8870a 100644 --- a/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/fileCommandChannel.ts @@ -6,7 +6,7 @@ import * as component from "../../../common/component"; import { delay } from "../../../common/utils"; import { CommandChannel, RunnerConnection } from "../commandChannel"; -import { 
EnvironmentInformation, Channel } from "../environment"; +import { Channel, EnvironmentInformation } from "../environment"; import { StorageService } from "../storageService"; class FileHandler { @@ -38,15 +38,21 @@ export class FileCommandChannel extends CommandChannel { } public async start(): Promise { - // start command loops - this.receiveLoop(); - this.sendLoop(); + // do nothing } public async stop(): Promise { this.stopping = true; } + public async run(): Promise { + // start command loops + await Promise.all([ + this.receiveLoop(), + this.sendLoop() + ]); + } + protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise { this.sendQueues.push([environment, message]); } diff --git a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts index 774b1a8ffc..3bd9c504aa 100644 --- a/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/webCommandChannel.ts @@ -66,6 +66,10 @@ export class WebCommandChannel extends CommandChannel { } } + public async run(): Promise{ + // do nothing + } + protected async sendCommandInternal(environment: EnvironmentInformation, message: string): Promise { if (this.webSocketServer === undefined) { throw new Error(`WebCommandChannel: uninitialized!`) diff --git a/src/nni_manager/training_service/reusable/commandChannel.ts b/src/nni_manager/training_service/reusable/commandChannel.ts index 80b1ddc5b0..dce405e94c 100644 --- a/src/nni_manager/training_service/reusable/commandChannel.ts +++ b/src/nni_manager/training_service/reusable/commandChannel.ts @@ -59,6 +59,9 @@ export abstract class CommandChannel { public abstract start(): Promise; public abstract stop(): Promise; + // Pull-based command channels need loop to check messages, the loop should be started with await here. 
+ public abstract run(): Promise; + protected abstract sendCommandInternal(environment: EnvironmentInformation, message: string): Promise; protected abstract createRunnerConnection(environment: EnvironmentInformation): RunnerConnection; diff --git a/src/nni_manager/training_service/reusable/trialDispatcher.ts b/src/nni_manager/training_service/reusable/trialDispatcher.ts index 7743fced01..2cce11c760 100644 --- a/src/nni_manager/training_service/reusable/trialDispatcher.ts +++ b/src/nni_manager/training_service/reusable/trialDispatcher.ts @@ -219,6 +219,7 @@ class TrialDispatcher implements TrainingService { await Promise.all([ this.environmentMaintenanceLoop(), this.trialManagementLoop(), + this.commandChannel.run(), ]); } From 0517e13e56aa82a5f643ebc83b88c4563efe03db Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 18:26:38 +0800 Subject: [PATCH 77/98] fix comments --- .../training_service/reusable/environment.ts | 3 --- .../environments/amlEnvironmentService.ts | 22 +++++-------------- 2 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index 98bcd43323..a3cfd1d742 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -62,9 +62,6 @@ export class EnvironmentInformation { public runnerWorkingFolder: string = ""; public command: string = ""; public nodeCount: number = 1; - // aml related resource, need to refactor - public environmentLocalTempFolder: string = ""; - public environmentClient: any = ""; // it's used to aggregate node status for multiple node trial public nodes: Map; diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 7ce3ae85d7..a836403737 100644 --- 
a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -66,10 +66,6 @@ export class AMLEnvironmentService extends EnvironmentService { public async config(key: string, value: string): Promise { switch (key) { - case TrialConfigMetadataKey.NNI_MANAGER_IP: - this.nniManagerIpConfig = JSON.parse(value); - break; - case TrialConfigMetadataKey.AML_CLUSTER_CONFIG: this.amlClusterConfig = JSON.parse(value); break; @@ -84,16 +80,8 @@ export class AMLEnvironmentService extends EnvironmentService { await validateCodeDir(this.amlTrialConfig.codeDir); break; } - case TrialConfigMetadataKey.VERSION_CHECK: - this.versionCheck = (value === 'true' || value === 'True'); - this.nniVersion = this.versionCheck ? await getVersion() : ''; - break; - case TrialConfigMetadataKey.MULTI_PHASE: - this.isMultiPhase = (value === 'true' || value === 'True'); - break; default: - //Reject for unknown keys - this.log.error(`Uknown key: ${key}`); + this.log.debug(`AML not proccessed metadata key: '${key}', value: '${value}'`); } } @@ -108,18 +96,18 @@ export class AMLEnvironmentService extends EnvironmentService { case 'WAITING': case 'RUNNING': case 'QUEUED': + // RUNNING status is set by runner, and ignore waiting status break; case 'COMPLETED': - environment.status = 'SUCCEEDED'; case 'SUCCEEDED': - environment.status = 'SUCCEEDED'; + environment.setFinalStatus('SUCCEEDED'); break; case 'FAILED': - environment.status = 'FAILED'; + environment.setFinalStatus('FAILED'); break; case 'STOPPED': case 'STOPPING': - environment.status = 'USER_CANCELED'; + environment.setFinalStatus('USER_CANCELED'); break; default: environment.status = 'UNKNOWN'; From fc4b9785c3187891dd500a29404aafba6c5b035c Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 18:30:44 +0800 Subject: [PATCH 78/98] sort class --- .../training_service/reusable/environment.ts | 50 +++++++++---------- 1 file 
changed, 25 insertions(+), 25 deletions(-) diff --git a/src/nni_manager/training_service/reusable/environment.ts b/src/nni_manager/training_service/reusable/environment.ts index a3cfd1d742..547ddc43ca 100644 --- a/src/nni_manager/training_service/reusable/environment.ts +++ b/src/nni_manager/training_service/reusable/environment.ts @@ -14,31 +14,6 @@ import { CommandChannel } from "./commandChannel"; export type EnvironmentStatus = 'UNKNOWN' | 'WAITING' | 'RUNNING' | 'SUCCEEDED' | 'FAILED' | 'USER_CANCELED'; export type Channel = "web" | "file" | "aml" | "ut"; -export class NodeInfomation { - public id: string; - public status: TrialJobStatus = "UNKNOWN"; - public endTime?: number; - - constructor(id: string) { - this.id = id; - } -} - -export class RunnerSettings { - public experimentId: string = ""; - public platform: string = ""; - public nniManagerIP: string = ""; - public nniManagerPort: number = 8081; - public nniManagerVersion: string = ""; - public logCollection: string = "none"; - public command: string = ""; - public enableGpuCollector: boolean = false; - - // specify which communication channel is used by runner. - // supported channel includes: rest, storage, aml - public commandChannel: Channel = "file"; -} - export class EnvironmentInformation { private log: Logger; @@ -107,3 +82,28 @@ export abstract class EnvironmentService { return new EnvironmentInformation(envId, envName); } } + +export class NodeInfomation { + public id: string; + public status: TrialJobStatus = "UNKNOWN"; + public endTime?: number; + + constructor(id: string) { + this.id = id; + } +} + +export class RunnerSettings { + public experimentId: string = ""; + public platform: string = ""; + public nniManagerIP: string = ""; + public nniManagerPort: number = 8081; + public nniManagerVersion: string = ""; + public logCollection: string = "none"; + public command: string = ""; + public enableGpuCollector: boolean = false; + + // specify which communication channel is used by runner. 
+ // supported channel includes: rest, storage, aml + public commandChannel: Channel = "file"; +} From e527743d8f112fe702e6bf823f54f988ebd695f7 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 19:37:52 +0800 Subject: [PATCH 79/98] fix eslint --- .../rest_server/restValidationSchemas.ts | 2 +- .../common/containerJobData.ts | 2 +- .../reusable/aml/amlClient.ts | 9 +++---- .../reusable/aml/amlConfig.ts | 1 - .../reusable/channels/amlCommandChannel.ts | 6 ++--- .../environments/amlEnvironmentService.ts | 25 ++++++++----------- 6 files changed, 18 insertions(+), 27 deletions(-) diff --git a/src/nni_manager/rest_server/restValidationSchemas.ts b/src/nni_manager/rest_server/restValidationSchemas.ts index 7cee7611de..302073707e 100644 --- a/src/nni_manager/rest_server/restValidationSchemas.ts +++ b/src/nni_manager/rest_server/restValidationSchemas.ts @@ -152,7 +152,7 @@ export namespace ValidationSchemas { email: joi.string().min(1), password: joi.string().min(1) }), - aml_config: joi.object({ + aml_config: joi.object({ // eslint-disable-line @typescript-eslint/camelcase subscriptionId: joi.string().min(1), resourceGroup: joi.string().min(1), workspaceName: joi.string().min(1) diff --git a/src/nni_manager/training_service/common/containerJobData.ts b/src/nni_manager/training_service/common/containerJobData.ts index f7a29f384a..99690f415c 100644 --- a/src/nni_manager/training_service/common/containerJobData.ts +++ b/src/nni_manager/training_service/common/containerJobData.ts @@ -10,5 +10,5 @@ if python3 -c 'import nni' > /dev/null 2>&1; then return else # Install nni - python3 -m pip install --user --upgrade nni + python3 -m pip install --user --no-cache-dir -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nni==1.63654 fi`; diff --git a/src/nni_manager/training_service/reusable/aml/amlClient.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts index 3786f434f8..cb6ba9a380 100644 --- 
a/src/nni_manager/training_service/reusable/aml/amlClient.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -3,9 +3,6 @@ 'use strict'; -import * as fs from 'fs'; -import * as request from 'request'; -import * as path from 'path'; import { Deferred } from 'ts-deferred'; import { PythonShell } from 'python-shell'; @@ -82,7 +79,7 @@ export class AMLClient { this.pythonShellClient.send('tracking_url'); let trackingUrl = ''; this.pythonShellClient.on('message', function (status: any) { - let items = status.split(':'); + const items = status.split(':'); if (items[0] === 'tracking_url') { trackingUrl = items.splice(1, items.length).join('') } @@ -99,7 +96,7 @@ export class AMLClient { let newStatus = oldStatus; this.pythonShellClient.send('update_status'); this.pythonShellClient.on('message', function (status: any) { - let items = status.split(':'); + const items = status.split(':'); if (items[0] === 'status') { newStatus = items.splice(1, items.length).join('') } @@ -122,7 +119,7 @@ export class AMLClient { } this.pythonShellClient.send('receive'); this.pythonShellClient.on('message', function (command: any) { - let items = command.split(':') + const items = command.split(':') if (items[0] === 'receive') { deferred.resolve(JSON.parse(command.slice(8))) } diff --git a/src/nni_manager/training_service/reusable/aml/amlConfig.ts b/src/nni_manager/training_service/reusable/aml/amlConfig.ts index de0f3ce2ca..e0099e0cd8 100644 --- a/src/nni_manager/training_service/reusable/aml/amlConfig.ts +++ b/src/nni_manager/training_service/reusable/aml/amlConfig.ts @@ -3,7 +3,6 @@ 'use strict'; -import { TrialJobApplicationForm, TrialJobDetail, TrialJobStatus } from '../../../common/trainingService'; import { TrialConfig } from '../../common/trialConfig'; import { EnvironmentInformation } from '../environment'; import { AMLClient } from '../aml/amlClient'; diff --git a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts 
b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts index 2da929cb20..d57befd507 100644 --- a/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts +++ b/src/nni_manager/training_service/reusable/channels/amlCommandChannel.ts @@ -93,9 +93,9 @@ export class AMLCommandChannel extends CommandChannel { if (!amlClient) { throw new Error('AML client not initialized!'); } - let command = await amlClient.receiveCommand(); - if (command && command.hasOwnProperty('trial_runner')) { - let messages = command['trial_runner']; + const command = await amlClient.receiveCommand(); + if (command && Object.prototype.hasOwnProperty.call(command, "trial_runner")) { + const messages = command['trial_runner']; if (messages) { if (messages instanceof Object && this.currentMessageIndex < messages.length - 1) { for (let index = this.currentMessageIndex + 1; index < messages.length; index ++) { diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index a836403737..a02c0fd0f6 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -5,7 +5,6 @@ import * as fs from 'fs'; import * as path from 'path'; -import { Deferred } from 'ts-deferred'; import * as component from '../../../common/component'; import { getExperimentId } from '../../../common/experimentStartupInfo'; import { getLogger, Logger } from '../../../common/log'; @@ -15,14 +14,10 @@ import { EnvironmentInformation, EnvironmentService } from '../environment'; import { AMLEnvironmentInformation } from '../aml/amlConfig'; import { AMLClient } from '../aml/amlClient'; import { - NNIManagerIpConfig, TrainingService, - TrialJobApplicationForm, TrialJobDetail, TrialJobMetric + NNIManagerIpConfig, } from '../../../common/trainingService'; -import { execMkdir, 
validateCodeDir, execCopydir } from '../../common/util'; -import { - delay, generateParamFileName, getExperimentRootDir, getIPV4Address, getJobCancelStatus, - getVersion, uniqueString -} from '../../../common/utils'; +import { validateCodeDir } from '../../common/util'; +import { getExperimentRootDir } from '../../../common/utils'; import { AMLCommandChannel } from '../channels/amlCommandChannel'; import { CommandChannel } from "../commandChannel"; import { EventEmitter } from "events"; @@ -87,11 +82,11 @@ export class AMLEnvironmentService extends EnvironmentService { public async refreshEnvironmentsStatus(environments: EnvironmentInformation[]): Promise { environments.forEach(async (environment) => { - let amlClient = (environment as AMLEnvironmentInformation).amlClient; + const amlClient = (environment as AMLEnvironmentInformation).amlClient; if (!amlClient) { throw new Error('AML client not initialized!'); } - let status = await amlClient.updateStatus(environment.status); + const status = await amlClient.updateStatus(environment.status); switch (status.toUpperCase()) { case 'WAITING': case 'RUNNING': @@ -110,7 +105,7 @@ export class AMLEnvironmentService extends EnvironmentService { environment.setFinalStatus('USER_CANCELED'); break; default: - environment.status = 'UNKNOWN'; + environment.setFinalStatus('UNKNOWN'); } }); } @@ -122,8 +117,8 @@ export class AMLEnvironmentService extends EnvironmentService { if (this.amlTrialConfig === undefined) { throw new Error('AML trial config is not initialized'); } - let amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; - let environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); + const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; + const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); environment.command = `import 
os\nos.system('${amlEnvironment.command}')`; await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command ,{ encoding: 'utf8' }); let amlClient = new AMLClient( @@ -143,8 +138,8 @@ export class AMLEnvironmentService extends EnvironmentService { } public async stopEnvironment(environment: EnvironmentInformation): Promise { - let amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; - let amlClient = amlEnvironment.amlClient; + const amlEnvironment: AMLEnvironmentInformation = environment as AMLEnvironmentInformation; + const amlClient = amlEnvironment.amlClient; if (!amlClient) { throw new Error('AML client not initialized!'); } From b047681c570300b87867fb7d0d030600016bc86f Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 20:04:37 +0800 Subject: [PATCH 80/98] fix eslint --- src/nni_manager/training_service/common/containerJobData.ts | 2 +- .../reusable/environments/amlEnvironmentService.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/nni_manager/training_service/common/containerJobData.ts b/src/nni_manager/training_service/common/containerJobData.ts index 99690f415c..f7a29f384a 100644 --- a/src/nni_manager/training_service/common/containerJobData.ts +++ b/src/nni_manager/training_service/common/containerJobData.ts @@ -10,5 +10,5 @@ if python3 -c 'import nni' > /dev/null 2>&1; then return else # Install nni - python3 -m pip install --user --no-cache-dir -i https://test.pypi.org/simple/ --extra-index-url https://pypi.org/simple nni==1.63654 + python3 -m pip install --user --upgrade nni fi`; diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index a02c0fd0f6..973ecb7555 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ 
b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -121,7 +121,7 @@ export class AMLEnvironmentService extends EnvironmentService { const environmentLocalTempFolder = path.join(this.experimentRootDir, this.experimentId, "environment-temp"); environment.command = `import os\nos.system('${amlEnvironment.command}')`; await fs.promises.writeFile(path.join(environmentLocalTempFolder, 'nni_script.py'), amlEnvironment.command ,{ encoding: 'utf8' }); - let amlClient = new AMLClient( + const amlClient = new AMLClient( this.amlClusterConfig.subscriptionId, this.amlClusterConfig.resourceGroup, this.amlClusterConfig.workspaceName, From 4acc7e87be3403fe17c45c3abb3ab66c17cd2239 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Tue, 30 Jun 2020 20:07:59 +0800 Subject: [PATCH 81/98] fix annotation --- tools/nni_trial_tool/aml_channel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/nni_trial_tool/aml_channel.py b/tools/nni_trial_tool/aml_channel.py index 61786dc25a..3e52c871ae 100644 --- a/tools/nni_trial_tool/aml_channel.py +++ b/tools/nni_trial_tool/aml_channel.py @@ -29,7 +29,6 @@ def _inner_send(self, message): def _inner_receive(self): messages = [] - # receive message is string, to get consistent result, encode it here. message_dict = self.run.get_metrics() if 'nni_manager' not in message_dict: return [] @@ -45,5 +44,6 @@ def _inner_receive(self): self.current_message_index += 1 newMessage = [] for message in messages: + # receive message is string, to get consistent result, encode it here. 
newMessage.append(message.encode('utf8')) return newMessage From ec1475a45a3c1413e9c3286cb2c5ab762017616d Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 07:46:39 +0800 Subject: [PATCH 82/98] fix import aml --- tools/nni_trial_tool/trial_runner.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 5edd39c28c..254dcfaa0e 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -188,8 +188,9 @@ def check_version(args): from .trial import Trial from .file_channel import FileChannel from .web_channel import WebChannel - from .aml_channel import AMLChannel from .commands import CommandType + if args.platform == 'aml': + from .aml_channel import AMLChannel is_multi_node = args.node_count > 1 From 8eaeebfc07420c7a986875a220626bc7cc6b1518 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 07:55:44 +0800 Subject: [PATCH 83/98] fix comments --- docs/en_US/TrainingService/AMLMode.md | 8 ++++---- examples/trials/mnist-pytorch/config_aml.yml | 2 +- examples/trials/mnist-tfv1/config_aml.yml | 4 ++-- src/nni_manager/config/aml/amlUtil.py | 3 +-- .../training_service/reusable/aml/amlClient.ts | 6 +----- .../training_service/reusable/aml/amlConfig.ts | 6 ++---- .../reusable/environments/amlEnvironmentService.ts | 1 - tools/nni_cmd/config_schema.py | 3 +-- 8 files changed, 12 insertions(+), 21 deletions(-) diff --git a/docs/en_US/TrainingService/AMLMode.md b/docs/en_US/TrainingService/AMLMode.md index 30c748adfb..6946329657 100644 --- a/docs/en_US/TrainingService/AMLMode.md +++ b/docs/en_US/TrainingService/AMLMode.md @@ -39,8 +39,8 @@ tuner: trial: command: python3 mnist.py codeDir: . 
- computeTarget: ussc40rscl - nodeCount: 1 + computeTarget: ${replace_to_your_computeTarget} + image: msranni/nni amlConfig: subscriptionId: ${replace_to_your_subscriptionId} resourceGroup: ${replace_to_your_resourceGroup} @@ -53,8 +53,8 @@ Note: You should set `trainingServicePlatform: aml` in NNI config YAML file if y Compared with [LocalMode](LocalMode.md) trial configuration in aml mode have these additional keys: * computeTarget * required key. The computer cluster name you want to use in your AML workspace. -* nodeCount - * required key. The number of nodes to use for one run. [refer](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.runconfiguration?view=azure-ml-py#variables) +* image + * required key. The docker image name used in job. amlConfig: * subscriptionId diff --git a/examples/trials/mnist-pytorch/config_aml.yml b/examples/trials/mnist-pytorch/config_aml.yml index be0c31c93c..ef61646029 100644 --- a/examples/trials/mnist-pytorch/config_aml.yml +++ b/examples/trials/mnist-pytorch/config_aml.yml @@ -18,7 +18,7 @@ trial: command: python3 mnist.py codeDir: . computeTarget: ussc40rscl - nodeCount: 1 + image: msranni/nni amlConfig: subscriptionId: ${replace_to_your_subscriptionId} resourceGroup: ${replace_to_your_resourceGroup} diff --git a/examples/trials/mnist-tfv1/config_aml.yml b/examples/trials/mnist-tfv1/config_aml.yml index 1f1f471e77..6a556a2f4e 100644 --- a/examples/trials/mnist-tfv1/config_aml.yml +++ b/examples/trials/mnist-tfv1/config_aml.yml @@ -17,8 +17,8 @@ tuner: trial: command: python3 mnist.py codeDir: . 
- computeTarget: ussc40rscl - nodeCount: 1 + computeTarget: ${replace_to_your_computeTarget} + image: msranni/nni amlConfig: subscriptionId: ${replace_to_your_subscriptionId} resourceGroup: ${replace_to_your_resourceGroup} diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index 16099dbc74..527388ea25 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -22,7 +22,6 @@ parser.add_argument('--experiment_name', help='the experiment name') parser.add_argument('--script_dir', help='script directory') parser.add_argument('--script_name', help='script name') - parser.add_argument('--node_count', help='node count of run') args = parser.parse_args() ws = Workspace(args.subscription_id, args.resource_group, args.workspace_name) @@ -36,7 +35,7 @@ run_config.environment.docker.enabled = True run_config.environment.docker.base_image = args.docker_image run_config.target = compute_target - run_config.node_count = args.node_count + run_config.node_count = 1 config = ScriptRunConfig(source_directory=args.script_dir, script=args.script_name, run_config=run_config) run = experiment.submit(config) print(run.get_details()["runId"]) diff --git a/src/nni_manager/training_service/reusable/aml/amlClient.ts b/src/nni_manager/training_service/reusable/aml/amlClient.ts index cb6ba9a380..3fdf25eee4 100644 --- a/src/nni_manager/training_service/reusable/aml/amlClient.ts +++ b/src/nni_manager/training_service/reusable/aml/amlClient.ts @@ -15,7 +15,6 @@ export class AMLClient { public scriptName: string; public pythonShellClient: undefined | PythonShell; public codeDir: string; - public nodeCount: number; public computeTarget: string; constructor( @@ -24,7 +23,6 @@ export class AMLClient { workspaceName: string, experimentId: string, computeTarget: string, - nodeCount: number, image: string, scriptName: string, codeDir: string, @@ -34,7 +32,6 @@ export class AMLClient { this.workspaceName = workspaceName; 
this.experimentId = experimentId; this.image = image; - this.nodeCount = nodeCount; this.scriptName = scriptName; this.codeDir = codeDir; this.computeTarget = computeTarget; @@ -53,8 +50,7 @@ export class AMLClient { '--docker_image', this.image, '--experiment_name', `nni_exp_${this.experimentId}`, '--script_dir', this.codeDir, - '--script_name', this.scriptName, - '--node_count', this.nodeCount.toString() + '--script_name', this.scriptName ] }); this.pythonShellClient.on('message', function (envId: any) { diff --git a/src/nni_manager/training_service/reusable/aml/amlConfig.ts b/src/nni_manager/training_service/reusable/aml/amlConfig.ts index e0099e0cd8..2c101883e1 100644 --- a/src/nni_manager/training_service/reusable/aml/amlConfig.ts +++ b/src/nni_manager/training_service/reusable/aml/amlConfig.ts @@ -23,19 +23,17 @@ export class AMLTrialConfig extends TrialConfig { public readonly image: string; public readonly command: string; public readonly codeDir: string; - public readonly nodeCount: number; public readonly computeTarget: string; - constructor(codeDir: string, command: string, image: string, nodeCount: number, computeTarget: string) { + constructor(codeDir: string, command: string, image: string, computeTarget: string) { super("", codeDir, 0); this.codeDir = codeDir; this.command = command; this.image = image; - this.nodeCount = nodeCount; this.computeTarget = computeTarget; } } export class AMLEnvironmentInformation extends EnvironmentInformation { - public amlClient?: AMLClient; + public amlClient?: AMLClient; } diff --git a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts index 973ecb7555..7b17c5cc4d 100644 --- a/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts +++ b/src/nni_manager/training_service/reusable/environments/amlEnvironmentService.ts @@ -127,7 +127,6 @@ export class AMLEnvironmentService extends 
EnvironmentService { this.amlClusterConfig.workspaceName, this.experimentId, this.amlTrialConfig.computeTarget, - this.amlTrialConfig.nodeCount, this.amlTrialConfig.image, 'nni_script.py', environmentLocalTempFolder diff --git a/tools/nni_cmd/config_schema.py b/tools/nni_cmd/config_schema.py index cc96682dbc..b8bed07fb9 100644 --- a/tools/nni_cmd/config_schema.py +++ b/tools/nni_cmd/config_schema.py @@ -239,8 +239,7 @@ def validate(self, data): 'codeDir': setPathCheck('codeDir'), 'command': setType('command', str), 'image': setType('image', str), - 'computeTarget': setType('computeTarget', str), - 'nodeCount': setType('nodeCount', int) + 'computeTarget': setType('computeTarget', str) } } From 56b68185ab703ad2bb5e66b38d2f730b4e8d6672 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 08:04:42 +0800 Subject: [PATCH 84/98] fix doc build --- docs/en_US/training_services.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en_US/training_services.rst b/docs/en_US/training_services.rst index 435abc0c26..71b2cf6f8c 100644 --- a/docs/en_US/training_services.rst +++ b/docs/en_US/training_services.rst @@ -10,3 +10,4 @@ Introduction to NNI Training Services Kubeflow<./TrainingService/KubeflowMode> FrameworkController<./TrainingService/FrameworkControllerMode> DLTS<./TrainingService/DLTSMode> + AML<./TrainingService/AMLMode> From e09ff79356f03804b09b845aaf87c524794aada0 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 08:05:54 +0800 Subject: [PATCH 85/98] fix trial_runner import --- tools/nni_trial_tool/trial_runner.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/nni_trial_tool/trial_runner.py b/tools/nni_trial_tool/trial_runner.py index 254dcfaa0e..5de7f0c83e 100644 --- a/tools/nni_trial_tool/trial_runner.py +++ b/tools/nni_trial_tool/trial_runner.py @@ -189,8 +189,6 @@ def check_version(args): from .file_channel import FileChannel from .web_channel import WebChannel from .commands import CommandType - if 
args.platform == 'aml': - from .aml_channel import AMLChannel is_multi_node = args.node_count > 1 @@ -213,6 +211,7 @@ def check_version(args): if args.command_channel == "file": command_channel = FileChannel(args) elif args.command_channel == 'aml': + from .aml_channel import AMLChannel command_channel = AMLChannel(args) else: command_channel = WebChannel(args) From ecf615d4ab3a4c2b35d7acc561ea557f6d513794 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 08:12:30 +0800 Subject: [PATCH 86/98] fix doc --- examples/trials/mnist-pytorch/config_aml.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/trials/mnist-pytorch/config_aml.yml b/examples/trials/mnist-pytorch/config_aml.yml index ef61646029..883a00340e 100644 --- a/examples/trials/mnist-pytorch/config_aml.yml +++ b/examples/trials/mnist-pytorch/config_aml.yml @@ -17,7 +17,7 @@ tuner: trial: command: python3 mnist.py codeDir: . - computeTarget: ussc40rscl + computeTarget: ${replace_to_your_computeTarget} image: msranni/nni amlConfig: subscriptionId: ${replace_to_your_subscriptionId} From a7a3baf35f445c9e49de60003a95fee327548bd0 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 08:32:41 +0800 Subject: [PATCH 87/98] fix pylint --- tools/nni_trial_tool/aml_channel.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/nni_trial_tool/aml_channel.py b/tools/nni_trial_tool/aml_channel.py index 3e52c871ae..c8e1d7484a 100644 --- a/tools/nni_trial_tool/aml_channel.py +++ b/tools/nni_trial_tool/aml_channel.py @@ -1,9 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT license. 
-import websockets -import json -from azureml.core.run import Run +from azureml.core.run import Run # pylint: disable=import-error from .base_channel import BaseChannel from .log_utils import LogType, nni_log @@ -40,7 +38,7 @@ def _inner_receive(self): messages = message_list[self.current_message_index + 1 : len(message_list)] self.current_message_index = len(message_list) - 1 elif self.current_message_index == -1: - messages = [message_list] + messages = [message_list] self.current_message_index += 1 newMessage = [] for message in messages: From 7eaa1058110cdd8e68132b63c329c9f1a4b81a1c Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 11:14:42 +0800 Subject: [PATCH 88/98] add doc for aml --- README.md | 1 + docs/en_US/TrainingService/Overview.md | 1 + 2 files changed, 2 insertions(+) diff --git a/README.md b/README.md index a9dd388482..8b8fa4a9d0 100644 --- a/README.md +++ b/README.md @@ -170,6 +170,7 @@ Within the following table, we summarized the current NNI capabilities, we are g

    • Local Machine
    • Remote Servers
    • +
    • AML(Azure Machine Learning)
    • Kubernetes based services
      • OpenPAI
      • Kubeflow
      • diff --git a/docs/en_US/TrainingService/Overview.md b/docs/en_US/TrainingService/Overview.md index 77e46fafdf..1ced708b92 100644 --- a/docs/en_US/TrainingService/Overview.md +++ b/docs/en_US/TrainingService/Overview.md @@ -26,6 +26,7 @@ In case users intend to use large files in their experiment (like large-scaled d |[__Kubeflow__](./KubeflowMode.html)|NNI supports running experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. Before starting to use NNI kubeflow mode, you should have a Kubernetes cluster, either on-premises or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), a Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is setup to connect to your Kubernetes cluster. If you are not familiar with Kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a good start. In kubeflow mode, your trial program will run as Kubeflow job in Kubernetes cluster.| |[__FrameworkController__](./FrameworkControllerMode.html)|NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you don't need to install Kubeflow for specific deep learning framework like tf-operator or pytorch-operator. Now you can use FrameworkController as the training service to run NNI experiment.| |[__DLTS__](./DLTSMode.html)|NNI supports running experiment using [DLTS](https://github.com/microsoft/DLWorkspace.git), which is an open source toolkit, developed by Microsoft, that allows AI scientists to spin up an AI cluster in turn-key fashion.| +|[__AML__](./AMLMode.html)|NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/services/machine-learning/) , called aml mode. ## What does Training Service do? 
From a0ea554e2eaa0428d7e5c5239753597e73c5dbe3 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 11:31:30 +0800 Subject: [PATCH 89/98] add content --- docs/en_US/TrainingService/Overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en_US/TrainingService/Overview.md b/docs/en_US/TrainingService/Overview.md index 1ced708b92..ecdb3014bd 100644 --- a/docs/en_US/TrainingService/Overview.md +++ b/docs/en_US/TrainingService/Overview.md @@ -4,7 +4,7 @@ NNI training service is designed to allow users to focus on AutoML itself, agnostic to the underlying computing infrastructure where the trials are actually run. When migrating from one cluster to another (e.g., local machine to Kubeflow), users only need to tweak several configurations, and the experiment can be easily scaled. -Users can use training service provided by NNI, to run trial jobs on [local machine](./LocalMode.md), [remote machines](./RemoteMachineMode.md), and on clusters like [PAI](./PaiMode.md), [Kubeflow](./KubeflowMode.md) and [FrameworkController](./FrameworkControllerMode.md). These are called *built-in training services*. +Users can use training service provided by NNI, to run trial jobs on [local machine](./LocalMode.md), [remote machines](./RemoteMachineMode.md), and on clusters like [AML](./AMLMode.md), [PAI](./PaiMode.md), [Kubeflow](./KubeflowMode.md) and [FrameworkController](./FrameworkControllerMode.md). These are called *built-in training services*. If the computing resource customers try to use is not listed above, NNI provides interface that allows users to build their own training service easily. Please refer to "[how to implement training service](./HowToImplementTrainingService)" for details. 
From 972822c8163ea8d4105b7148d46a537a86655349 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 11:36:58 +0800 Subject: [PATCH 90/98] supplement dlts doc --- docs/en_US/TrainingService/Overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en_US/TrainingService/Overview.md b/docs/en_US/TrainingService/Overview.md index ecdb3014bd..3c99dc6f5c 100644 --- a/docs/en_US/TrainingService/Overview.md +++ b/docs/en_US/TrainingService/Overview.md @@ -4,7 +4,7 @@ NNI training service is designed to allow users to focus on AutoML itself, agnostic to the underlying computing infrastructure where the trials are actually run. When migrating from one cluster to another (e.g., local machine to Kubeflow), users only need to tweak several configurations, and the experiment can be easily scaled. -Users can use training service provided by NNI, to run trial jobs on [local machine](./LocalMode.md), [remote machines](./RemoteMachineMode.md), and on clusters like [AML](./AMLMode.md), [PAI](./PaiMode.md), [Kubeflow](./KubeflowMode.md) and [FrameworkController](./FrameworkControllerMode.md). These are called *built-in training services*. +Users can use training service provided by NNI, to run trial jobs on [local machine](./LocalMode.md), [remote machines](./RemoteMachineMode.md), and on clusters like [PAI](./PaiMode.md), [Kubeflow](./KubeflowMode.md), [FrameworkController](./FrameworkControllerMode.md), [DLTS](./DLTSMode.md) and [AML](./AMLMode.md). These are called *built-in training services*. If the computing resource customers try to use is not listed above, NNI provides interface that allows users to build their own training service easily. Please refer to "[how to implement training service](./HowToImplementTrainingService)" for details. 
From b5f7f060495f03ae727ce0078351eab9ffc0f3de Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 11:40:23 +0800 Subject: [PATCH 91/98] add doc content --- README.md | 2 +- docs/en_US/_templates/index.html | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8b8fa4a9d0..f25cce8835 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ **NNI (Neural Network Intelligence)** is a lightweight but powerful toolkit to help users **automate** Feature Engineering, Neural Architecture Search, Hyperparameter Tuning and Model Compression. -The tool manages automated machine learning (AutoML) experiments, **dispatches and runs** experiments' trial jobs generated by tuning algorithms to search the best neural architecture and/or hyper-parameters in **different training environments** like Local Machine, Remote Servers, OpenPAI, Kubeflow, FrameworkController on K8S (AKS etc.), DLWorkspace (aka. DLTS) and other cloud options. +The tool manages automated machine learning (AutoML) experiments, **dispatches and runs** experiments' trial jobs generated by tuning algorithms to search the best neural architecture and/or hyper-parameters in **different training environments** like Local Machine, Remote Servers, OpenPAI, Kubeflow, FrameworkController on K8S (AKS etc.), DLWorkspace (aka. DLTS), AML (Azure Machine Learning) and other cloud options. ## **Who should consider using NNI** diff --git a/docs/en_US/_templates/index.html b/docs/en_US/_templates/index.html index 5cc8257298..7a405267c3 100644 --- a/docs/en_US/_templates/index.html +++ b/docs/en_US/_templates/index.html @@ -25,6 +25,7 @@ Kubeflow, FrameworkController on K8S (AKS etc.) DLWorkspace (aka. DLTS) + AML (Azure Machine Learning) and other cloud options.

        @@ -197,6 +198,7 @@

        NNI capabilities in a glance

        (AKSetc.)
      • DLWorkspace (aka. DLTS)
      • +
      • AML (Azure Machine Learning)
    From f6d9c3fde006e7bfc47e03c0ccd1aee47ec1cc58 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 11:42:44 +0800 Subject: [PATCH 92/98] fix content --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f25cce8835..60f6a5bc21 100644 --- a/README.md +++ b/README.md @@ -170,13 +170,13 @@ Within the following table, we summarized the current NNI capabilities, we are g
    • Local Machine
    • Remote Servers
    • -
    • AML(Azure Machine Learning)
    • Kubernetes based services
    • From f39036fd2f919a286a43d48470357b63039f0f6c Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 11:44:12 +0800 Subject: [PATCH 93/98] fix content --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 60f6a5bc21..b65e3f4418 100644 --- a/README.md +++ b/README.md @@ -170,13 +170,13 @@ Within the following table, we summarized the current NNI capabilities, we are g
      • Local Machine
      • Remote Servers
      • +
      • AML(Azure Machine Learning)
      • Kubernetes based services
      • From 5e366cf853d9cceef36085c807005c5da0814c68 Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Wed, 1 Jul 2020 11:48:48 +0800 Subject: [PATCH 94/98] fix broken link --- docs/en_US/TrainingService/Overview.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/en_US/TrainingService/Overview.md b/docs/en_US/TrainingService/Overview.md index 3c99dc6f5c..c75cfc7a58 100644 --- a/docs/en_US/TrainingService/Overview.md +++ b/docs/en_US/TrainingService/Overview.md @@ -20,13 +20,13 @@ In case users intend to use large files in their experiment (like large-scaled d |TrainingService|Brief Introduction| |---|---| -|[__Local__](./LocalMode.html)|NNI supports running an experiment on local machine, called local mode. Local mode means that NNI will run the trial jobs and nniManager process in same machine, and support gpu schedule function for trial jobs.| -|[__Remote__](./RemoteMachineMode.html)|NNI supports running an experiment on multiple machines through SSH channel, called remote mode. NNI assumes that you have access to those machines, and already setup the environment for running deep learning training code. NNI will submit the trial jobs in remote machine, and schedule suitable machine with enough gpu resource if specified.| -|[__PAI__](./PaiMode.html)|NNI supports running an experiment on [OpenPAI](https://github.com/Microsoft/pai) (aka PAI), called PAI mode. Before starting to use NNI PAI mode, you should have an account to access an [OpenPAI](https://github.com/Microsoft/pai) cluster. See [here](https://github.com/Microsoft/pai#how-to-deploy) if you don't have any OpenPAI account and want to deploy an OpenPAI cluster. In PAI mode, your trial program will run in PAI's container created by Docker.| -|[__Kubeflow__](./KubeflowMode.html)|NNI supports running experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. 
Before starting to use NNI kubeflow mode, you should have a Kubernetes cluster, either on-premises or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), a Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is setup to connect to your Kubernetes cluster. If you are not familiar with Kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a good start. In kubeflow mode, your trial program will run as Kubeflow job in Kubernetes cluster.| -|[__FrameworkController__](./FrameworkControllerMode.html)|NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you don't need to install Kubeflow for specific deep learning framework like tf-operator or pytorch-operator. Now you can use FrameworkController as the training service to run NNI experiment.| -|[__DLTS__](./DLTSMode.html)|NNI supports running experiment using [DLTS](https://github.com/microsoft/DLWorkspace.git), which is an open source toolkit, developed by Microsoft, that allows AI scientists to spin up an AI cluster in turn-key fashion.| -|[__AML__](./AMLMode.html)|NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/services/machine-learning/) , called aml mode. +|[__Local__](./LocalMode.md)|NNI supports running an experiment on local machine, called local mode. Local mode means that NNI will run the trial jobs and nniManager process in same machine, and support gpu schedule function for trial jobs.| +|[__Remote__](./RemoteMachineMode.md)|NNI supports running an experiment on multiple machines through SSH channel, called remote mode. NNI assumes that you have access to those machines, and already setup the environment for running deep learning training code. 
NNI will submit the trial jobs in remote machine, and schedule suitable machine with enough gpu resource if specified.| +|[__PAI__](./PaiMode.md)|NNI supports running an experiment on [OpenPAI](https://github.com/Microsoft/pai) (aka PAI), called PAI mode. Before starting to use NNI PAI mode, you should have an account to access an [OpenPAI](https://github.com/Microsoft/pai) cluster. See [here](https://github.com/Microsoft/pai#how-to-deploy) if you don't have any OpenPAI account and want to deploy an OpenPAI cluster. In PAI mode, your trial program will run in PAI's container created by Docker.| +|[__Kubeflow__](./KubeflowMode.md)|NNI supports running experiment on [Kubeflow](https://github.com/kubeflow/kubeflow), called kubeflow mode. Before starting to use NNI kubeflow mode, you should have a Kubernetes cluster, either on-premises or [Azure Kubernetes Service(AKS)](https://azure.microsoft.com/en-us/services/kubernetes-service/), a Ubuntu machine on which [kubeconfig](https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/) is setup to connect to your Kubernetes cluster. If you are not familiar with Kubernetes, [here](https://kubernetes.io/docs/tutorials/kubernetes-basics/) is a good start. In kubeflow mode, your trial program will run as Kubeflow job in Kubernetes cluster.| +|[__FrameworkController__](./FrameworkControllerMode.md)|NNI supports running experiment using [FrameworkController](https://github.com/Microsoft/frameworkcontroller), called frameworkcontroller mode. FrameworkController is built to orchestrate all kinds of applications on Kubernetes, you don't need to install Kubeflow for specific deep learning framework like tf-operator or pytorch-operator. 
Now you can use FrameworkController as the training service to run NNI experiment.| +|[__DLTS__](./DLTSMode.md)|NNI supports running experiment using [DLTS](https://github.com/microsoft/DLWorkspace.git), which is an open source toolkit, developed by Microsoft, that allows AI scientists to spin up an AI cluster in turn-key fashion.| +|[__AML__](./AMLMode.md)|NNI supports running an experiment on [AML](https://azure.microsoft.com/en-us/services/machine-learning/) , called aml mode. ## What does Training Service do? From a767b795a0210d9eea937849349ce4ff2c2c3ff2 Mon Sep 17 00:00:00 2001 From: chicm-ms <38930155+chicm-ms@users.noreply.github.com> Date: Wed, 1 Jul 2020 14:01:31 +0800 Subject: [PATCH 95/98] Fix win32 build (#2623) --- src/webui/package.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/webui/package.json b/src/webui/package.json index 73ecb76767..9afe3d48ee 100644 --- a/src/webui/package.json +++ b/src/webui/package.json @@ -77,9 +77,9 @@ "typescript": "3.4.5" }, "scripts": { - "start": "node --max-old-space-size=4096 scripts/start.js", - "build": "node --max-old-space-size=4096 scripts/build.js", - "test": "node --max-old-space-size=4096 scripts/test.js", + "start": "node --max-old-space-size=3072 scripts/start.js", + "build": "node --max-old-space-size=3072 scripts/build.js", + "test": "node --max-old-space-size=3072 scripts/test.js", "eslint": "npx eslint ./ --ext .tsx,.ts" }, "eslintConfig": { From b3ec35a553d67af2dea6d74693dc4f070ed5de3e Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 3 Jul 2020 16:16:27 +0800 Subject: [PATCH 96/98] fix aml docker image --- deployment/docker/Dockerfile | 12 ++++++++++++ src/nni_manager/config/aml/amlUtil.py | 5 +---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/deployment/docker/Dockerfile b/deployment/docker/Dockerfile index 74835358bf..1f3b75b519 100644 --- a/deployment/docker/Dockerfile +++ b/deployment/docker/Dockerfile @@ -29,6 +29,11 @@ RUN 
DEBIAN_FRONTEND=noninteractive && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# +# generate python script +# +RUN cp /usr/bin/python3 /usr/bin/python + # # update pip # @@ -69,6 +74,13 @@ RUN python3 -m pip --no-cache-dir install pandas==0.23.4 lightgbm==2.2.2 # RUN python3 -m pip --no-cache-dir install nni +# +# install aml package +# +RUN python3 -m pip --no-cache-dir install azureml +RUN python3 -m pip --no-cache-dir install azureml-sdk + + ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/bin:/sbin WORKDIR /root diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index 527388ea25..ca2a5e51a0 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -28,10 +28,7 @@ compute_target = ComputeTarget(workspace=ws, name=args.compute_target) experiment = Experiment(ws, args.experiment_name) run_config = RunConfiguration() - dependencies = CondaDependencies() - dependencies.add_pip_package("azureml-sdk") - dependencies.add_pip_package("azureml") - run_config.environment.python.conda_dependencies = dependencies + run_config.environment.python.user_managed_dependencies = True run_config.environment.docker.enabled = True run_config.environment.docker.base_image = args.docker_image run_config.target = compute_target From c3bc4f51c63aa4f399989b6aae7753b4a37f301a Mon Sep 17 00:00:00 2001 From: Shinai Yang Date: Fri, 3 Jul 2020 16:40:09 +0800 Subject: [PATCH 97/98] revert change --- deployment/docker/Dockerfile | 11 ----------- src/nni_manager/config/aml/amlUtil.py | 5 ++++- src/webui/package.json | 6 +++--- 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/deployment/docker/Dockerfile b/deployment/docker/Dockerfile index 1f3b75b519..b82818b25b 100644 --- a/deployment/docker/Dockerfile +++ b/deployment/docker/Dockerfile @@ -29,11 +29,6 @@ RUN DEBIAN_FRONTEND=noninteractive && \ apt-get clean && \ rm -rf 
/var/lib/apt/lists/* -# -# generate python script -# -RUN cp /usr/bin/python3 /usr/bin/python - # # update pip # @@ -74,12 +69,6 @@ RUN python3 -m pip --no-cache-dir install pandas==0.23.4 lightgbm==2.2.2 # RUN python3 -m pip --no-cache-dir install nni -# -# install aml package -# -RUN python3 -m pip --no-cache-dir install azureml -RUN python3 -m pip --no-cache-dir install azureml-sdk - ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/bin:/sbin diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index ca2a5e51a0..11a004f30a 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -28,7 +28,10 @@ compute_target = ComputeTarget(workspace=ws, name=args.compute_target) experiment = Experiment(ws, args.experiment_name) run_config = RunConfiguration() - run_config.environment.python.user_managed_dependencies = True + dependencies = CondaDependencies() + dependencies.add_pip_package("azureml-sdk") + dependencies.add_pip_package("azureml") + run_config.environment.python.conda_dependencies = dependencies run_config.environment.docker.enabled = True run_config.environment.docker.base_image = args.docker_image run_config.target = compute_target diff --git a/src/webui/package.json b/src/webui/package.json index 9afe3d48ee..73ecb76767 100644 --- a/src/webui/package.json +++ b/src/webui/package.json @@ -77,9 +77,9 @@ "typescript": "3.4.5" }, "scripts": { - "start": "node --max-old-space-size=3072 scripts/start.js", - "build": "node --max-old-space-size=3072 scripts/build.js", - "test": "node --max-old-space-size=3072 scripts/test.js", + "start": "node --max-old-space-size=4096 scripts/start.js", + "build": "node --max-old-space-size=4096 scripts/build.js", + "test": "node --max-old-space-size=4096 scripts/test.js", "eslint": "npx eslint ./ --ext .tsx,.ts" }, "eslintConfig": { From 8c5f47120db773288dc1a07b29d74a100bc8819a Mon Sep 
17 00:00:00 2001 From: Shinai Yang Date: Fri, 3 Jul 2020 16:41:32 +0800 Subject: [PATCH 98/98] format --- deployment/docker/Dockerfile | 1 - src/nni_manager/config/aml/amlUtil.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/deployment/docker/Dockerfile b/deployment/docker/Dockerfile index b82818b25b..74835358bf 100644 --- a/deployment/docker/Dockerfile +++ b/deployment/docker/Dockerfile @@ -69,7 +69,6 @@ RUN python3 -m pip --no-cache-dir install pandas==0.23.4 lightgbm==2.2.2 # RUN python3 -m pip --no-cache-dir install nni - ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/root/.local/bin:/usr/bin:/bin:/sbin WORKDIR /root diff --git a/src/nni_manager/config/aml/amlUtil.py b/src/nni_manager/config/aml/amlUtil.py index 11a004f30a..527388ea25 100644 --- a/src/nni_manager/config/aml/amlUtil.py +++ b/src/nni_manager/config/aml/amlUtil.py @@ -29,8 +29,8 @@ experiment = Experiment(ws, args.experiment_name) run_config = RunConfiguration() dependencies = CondaDependencies() - dependencies.add_pip_package("azureml-sdk") - dependencies.add_pip_package("azureml") + dependencies.add_pip_package("azureml-sdk") + dependencies.add_pip_package("azureml") run_config.environment.python.conda_dependencies = dependencies run_config.environment.docker.enabled = True run_config.environment.docker.base_image = args.docker_image