From 7c63eaa5c7d98f6154c9d7cbb33b7bf33d1eb235 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 18 Dec 2017 14:34:54 +0800 Subject: [PATCH 01/20] Add profiler design documentation. --- doc/design/images/profiler.png | Bin 0 -> 51116 bytes doc/design/profiler.md | 95 +++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 doc/design/images/profiler.png create mode 100644 doc/design/profiler.md diff --git a/doc/design/images/profiler.png b/doc/design/images/profiler.png new file mode 100644 index 0000000000000000000000000000000000000000..d57b71ca88aaba5d05584a6219d84214e285a1e1 GIT binary patch literal 51116 zcmeFZbx>Vh^DYPk2p$N*-QC^Y9fG^N%fThMyL<2icZc8(!QC~u6ZCH0@BMxE&QwiJ z&HqCcbWrA3=AR}3=Dh-1_CIF8x0W!KER!o zC4|8$CvcB|2UrJ5O=mDLc+~e_aIlOlOrXM?2cR|> z7_U1w@YUAT#gNe5*2d16+nta2pBmi2_xEB3V#0r_xLETMYse`Qir71u60*^=(lZkC z!x0h^@;aHAaVv|8|JxjR;v=?jadF^gU;u$Y^dJ^`dna=SCN3^621aHEW@b8|2A#8q zor|G6ot-nue>(Y}end^3jh!qVTrBPF2;ciPG_rSf;Ugw~ALze7|Cy(YrP==;$fCn6FnotfBOcS^1hdHi`d&bIGH*-1O4-}@%~fu|5En9$N5ivMGJcudq4_K zmc~+cE~ZXEV;95s+3_>~+xq{x#{a!91t&{WVAcP$X8O1F|5^5LeO`w5h5sLd_>YwT zDFwvL568>!-!0>ZtKMN=0|OHTlM)sB>JEOg21>IPOXhgRV2QwjL?ep|$O|KsL~hP2 zd?F4@P!f$RMT!6y&-KXqjpBpbdyKG2nQ-0FLx$*t5{FC5LYzm5)$gBH_sqXD{oVH| zh`+=>_sm(o>RJMHx{TK4b^ch{V5K@|6wCj3_%^Ih90x-R5iSUPM16sX#VMONGe!*% zB}9>g{2U;<>4zfuI&uH+6CnyFA7nz8#Cf`DfW03g?E1; ziF>^N@*$(23?jT+lJKJ3obR}0(&uc_@71}2?|#*pYMa65NQR+#^IM6)%h8Hm+o^)@ zRbrLTsrCIDXk&-q?Ey_qBVfC;@3V|(m+#xtMw{zFHt6+8!B5NQb}m)V^JvAs%jdml z0@UfeWSV-~xMBxP{_SP$-1k0^;5jb|9)te*uBq$gr~Zp{jdfMW-2je#IdY2kCC_R7 zOuBiF$0Xmq#Q^(?&GjyYz~7U)$`Bkg^!Gn<`@lGPqBWBt4zwn7D-mL}J2>c(Fm1_glW6@}t^9)f?w z6%2eE)qi!`jh8TbLF8DykW%oRW?Rs4o{{4SS08%s!r3Yn2Fu+WXC_4^S? 
zo|1U``rqR1yTp4HP3sYxmrXLx7k{*@D;k34Tvr_jZL5DlYumLH$&mSN!Rxyp=7%P9 z<;Qo?0}=&+Q`o=j#RH+>Pw~Vefs+GZFJG=k83>q$oH}mjBN=u7HdOi8#x-G5QSA|5 z-s6E*1N2{r$@bljKMFhu={&X#lV^$!Q1EWS61*Hrki5oA;O8fy623kk2&5+cEU%Rg z%Ds#-hbeFU^TQFVUblo+lQ|A%78rWL!sOk)44?;?rs}!@sup{uDX^~&N)FG2(vFg{ zwqo7!@VV!j^q2s&kS=7eJs&;Q<5IF~jCnLEs{T2Wq4VKspYs$ckNdE9v=x_FGAm*{NC&>?v$^D?K7E4Py>dE;-2}q zR+-h`tu!5knuq4RDHdjl%ko{j$2Fw;_anwwmJqwM?ZYLlDpYyS0Id!5FvULmwo`W6 z*8+l6|6bCPr)cI?FPXnpbRM^is6xn1%)YoTD% zdZ1G9fa|(pnc&t($j zB?ajzWWY3v=_!?J`Hw$HvxTWn%!Fr9S#F&~nD`YviOrT#v{ugEzf9RWO5auwHOSj~ z!@Jl^6a9o^e|@899jPjQ7b&66}OuubF>X`*=ykjigo@RhPzfIq9l2KVYp!)Q9ic5bb#}P)3oQ#FNRVUAiel*YG9TQ`yv{ zSkyoXMe?mdOw3{IN}+d=!z|1c6ap7;6fZPx_=KoBSjSX{9FX;v6a0y zbw4D~bMjvOeV=GW0ds)Pe{oUC8rSnqD6|%OOb;AEp%~44hb4R&~CmKrZrk+5d(WvL7Kj5}W2p{S*dChPnS6DwK%zc$;h_|CR8GcpgLO$f|&=TNmxp6VP=UH{_o+jgrb)TFU`s z%&Qm5#4hubam@S(zo(TA{v*I6Jns+)2;1U|s=NWt<8||+fcs^n{;JGY*F!%8@KOuV zw(KT!kWR3&1ivp!5Bu+kdS`8_NxIns%0uqk-&`VlljX_KXYg@{Yu|@ng4YAT*8`xV z%XjyZXg9)jkmU>*;6}6aFB=(7l17z)UHn;*`?Ge~RtNa2@d6;~`}N89Vy?>9btC;PXAAHSuQGa930?YpcU>ya z##A9xL9q04lcuab&!|Ppqi==NY9lS}W{l_GB;!@A}<}k=|K>FSk5YV-q z)K2=|)wPjj9Ij_rS1z$jl{_?toAX`)-hjd(Z+Bh0_Vw{N*JWDD-n|zm=XsanK8Hqy z{QY&9f`|C9drlI=xn0?IW@X#=k^ds#fcJo_;xaLYiN<3P4^&>=1E=7%Jfj8w7x3^c z>z>E9fX6Ct0B!i(w*e|T&q}sW;rCKeKP^pWHV^IldYzl=&;tv|EtbIZkSMWp#kOf} zT805J=ij&_x5dGKGMfh!PL=|Ok~s;u5FqGiyWAiWm|n16W!wn-#NXVD#4{V;`?UW@+*6T&L9bIP<@Y#~TkjA_*Z2_X_wX?o z#1Mz|9D-vvg_pRLY1_E;4GP{32y&2Be1Up@`hf`Qdd!^v8VzvHKvdJdXcYOf$Kcnp z9YNE$WR`g-{Kfa_w8`&Sd*kyCl(u=c>+a)wBy=F~wuEqqv+*!62!s#If0Igh_keIf z6L8vTOh^LH4xhU-H=g$?N#Bq@K(Nqy%Qk*z$}tc3YnnbHT?F99ls$^x?L?E6L1Q6z^?)Q82Y%W z#)PDy7elf@_*SNQ`T>cvy8;-F!#yb|s9ig^ha{F)IM%+uA|@5@jc*5QGOyw@#u%%Y zLYyNBhSMH~8S}bsh4Q(Q1fCdf-%DD8py2)kpy=kSOGVx9j`{g|Qlixt+ZzZzs8nKS zZ!da39VgKf8W{Sp28@49X1xYHcPD2774Du_!K@b$Iwnwu;jPn0hbmkA0m4J}?Nl*{ z=ZcGu`0j+EvjZQpFJ&0IIWIYu-^0T9F!#|$UB|J%EI3so&ji8+p|BHzz+k1UrmO;_ zmOP&-vIpti7}kxo6ne0NNoRdZOx$(2#Mida^?JMU_IjWDw0ZzXd9jg)ucHUXK 
zwl_jk^$f(sNb0t&N5%A)xIJFyUK=r6gbkb_gYzz{x+i^z~{oi z1Z`1n(C7U-QqKhX@?9Gx0|{A7&c0QdIjfo4U zi_cYWQwv$rcZ99G!li6I^r0%Wzr8+PP8o*c-ThGSkp63%$QF^!mwGd_(0SJooRfQ7 zZvNX}3>u}!Fj0=33Pb@uO8v83*(u%ZE+lD`+rRK=9HkDhZ@wqS1gK^W#oPLzHCL3F zHd}ZTOo=V)@*45N%hYDY2W^m!a?(yEXd0%UT15KdN1usMfde&LG~|~eK|0pyezK%e zS~8t!6X5&>39)WFQ1U2-CrYPeHpYQi`Aq8}3~3h9m8xlm<=bLLAfh0gPEG<^b(9$2 zyBI@=Z_Cdi=$r znV}l8QSRtBX8s5r1H!WST|M|8r*I4lZ9lC=EK~&sZ4_jPj7|kFf%W}| z2c`b^*m`fZdxrFC?kgr*>t(x4F(n#a;{3O* zvEl(4BtA{IYv94hT54e= zR^d&XYucH)aKv&N&%k@!i9k^EV@TwTVhxhSzlO!J_TwC~ii zmjd4+gd_Q3yQZ0S7i>#SGuJ>aEsto^_mmV7v(z~3Z2g$2Z3VD2#-3w@iWkqX3lgP| z>8z^p2ciC4$KpjzYi^nD@j1DnryVF^1Sn&pq|D0bk)^4y=j?}tgY=Y%H}`<^PM%i4 zzO_2*dV8%XUp219r@^~5AvHqnr6n#R>0~nU)juwiUN3`lUs%$G(IYwu^_}W)=$OJ# znI*&GV(!`QwX3RsK*2hg%snS_v>YlBNz&nijxuH9*f4&SO@Wcc2ESwUK4BY3fL{o z+PpTT%vz|mVqJLjl5o)s80=iox<^U2TL#BiTV=eJ#aW3 z!tv*Csm>ewTIF+rYZdmV(+%8rQ)}DyQkI8pI8Ib^;eio>*j`yus+?z#z^`3t;7Q7L zeH|xo-FpTBwQ;`WhFU(_TXuJbe&>{|Iixa z-eFxUb;l#m$bGT{fDKZ|NrO~az31PZz}qJ3B^gkXjfw^fzSquW|{D$?{q@7ogZ-Z2y=g zGOCg@6q;ZHsDfr+55dO6Od+hh^L9pDzXiDkx7}a|ArdZa#t8pJN@pfv7n!`>l8ry9c}Ls*nl%c40buPEDIG z=c8AQWXv8vwa1HTMRZF#{U=&IT71-OHC(nUFSnKxU#B@gq zSlt9RxGdY$xUyj{v$j|atjWrt?R;ISgUb2~jk@Kq4Dz^cTeG;f7#NMt6{D*3?(Lc;1yG{yR| z+7l`v{G5`=8E9;4#OI7j(uB+p+C-9Cdhx}abXzqeH=D zhcvsx%W-_Q^TB0heU5f~(K>evC-QmxT@o15R!G}a&0yLAs`D7e1YcrPjV4l{ktXbk zPMUG%B6(g*735m{dwwLuWsM~a8HHVLD6&{>qfE^B6S=z+ra}J-I-ZvKPJX?1b6F}8 zWWw>J2b1~I9%MdAa%}wqy1GB{U@EREVJ~`{E#dF~*>x(8II|G*cJ23|EZa=75>+31 zdj%X(88h>jk8o|GsLo^$0*6{Z2Qa+@mk9XV4z2+xV<}DZD7Y?DLZ%pD=~my%+O~K% zkeSJ;L^OY`x@~eWaH>#_?|TA~i}QWmIZ1T4!ONrsc81OKF7jrt@aaH?DX3&PnT4#S zd^ab-skj|^ov#gu2$C=|q3$BP%eCIt7&y~?A5jkfneUB2o@3?Z<8f8j)d2@=R9h?^#M=20?nGutW`69b@Bx_5}IpO zxfaiO=FjgRtfL+7!^^4TcVM=Yxwf;5p2N+jvUpkhVYy_q_!BWBc5J!+se}v>S8}pEiI?U}o<2OdnyP1u}5q2M0 zN@^vHV~RmOY9Yoy^F_qEyRSs$Uott>GL-e_VMk2J39n?2CwP91_`y_(wrB^d5&5kn z4n*_hHf({CI=7NGNmMtVp67uBz1cG_T!_$CAIyr`bo;zyVL~D3v9+Y7S~rZFGPUZy z6U}ZO9YLGs*cWq(=xwiU-@$gYn;=79V=-q}o-O@z$5w%e=aV*q9AVs=8;m1mVxlWp 
zGaxhVcIyJc;@kJLoxHQVZVDENs?;lf3mQVCbG7vMayzc;dm88k7D@f-)b<+?_TMN6 zMK7L=1XZ7Qqqeg3qpT^9XG>zx$eqUWfX-GqQ5(;OgRm|y@x4w^oL3^me%9}*y;hF0 zGHrOi+JIik$rc^PVQ&m)m4B;uUZHt4DZLb!WH>V6q%jbdRYTr1Q1D)n-^_m8bg~Uh z%cU-#YRHTmXUh9g`N`){mBrvuZ$Q=hCzS{xY5T!uGTzxBNp82?*pQB z`G5~}HH<4997Y!F&F!mn0kAC7A(6%z5NI;N*@_&e_s^Q-$JLagy=gC}-y`IV4+!Fa z)jBTBAAU}%OzynFsWuEgg3kOf-Gg>m-d>%njUn@B$#>YeamC^j{SM}E8w0qp&_6KJ zpN&9wNX`8XWP?t0>P!w;?HMVBh7#k#ydRC(j5&-XJ1X3ZAn~PbS1Znm*cAi4v1zf)m>FR6k}qf4I~7= zqqvViJJ0_{}u-nVYz~eJp-l z$sT`U>wODSPsFE@2xy@KXD-mLUT*{!#IW-o(jC~ z2w3c{tz?0ogu#CJ6rCre1@59q;2LUfeo0v>YHeprd3T!WfGE5_hLoxx1r=3mp^9;^ z%|)+chEnaJve$TtAJ3%dx-Ip75?IZ%w8Qk^S#|DVev-@EdpfE6WYh@zv%!SEVxTdl zJgKf9UHgLTfTKjh`<#yTyvUl&3ljU|X6J2%J=6k_tl2q}qON)RHMN`;6!h(&yKj8gs^$kPTps)aMY(r!`7w>{0igJqs2 zMAKi6Znz^q;;Bv1jQ{hH3@cOMr42vzZ@Tu{+Q)0+2sPbRDCwyxG4JmLRTUq+EF?M<0B(y3 zFN0Hby_-AP{VInOEK=0mo>$2Iv{Wb$W z7Dw>bDHxVoH5JBvH0@#$6`2dR8H*x`JMJ=nMltyS9qF&R^-FC^Q}OQcra94PAmqao?nZtjoU zjfuV;zfK^$Lo80$v?88@24>8jXEJLe(oOif^~o;{j%^38Y2-|M<5A5xJ`L(H+IG{d zJ==$E695jK=dJWLi|H=ER?Mv%1R^dPhcsr+gqW+dR0 z*n166zpYf3zJVt-+$~%4oVd1w;_H42z%_N`p~ZgiXN=6EgSk0thmP<1TcrufrM-zY z;=5n-OXRPHS*OxaH+XprS8iV;Bg@8PSCfD0hSkrCebIMS+&gzPe$eM=QC()Mno;g$ zI&pTh6i0C%d*h`tDaIFtQ-~7)61I4dJ#Imj%_Ur4SE3k9a6{zkJo;($YAy$GK87wy z_uqXV%Nbj1G;(EMh@h#&oSeuhuv_{NiE{x*Q@&|(dGW~{zw4xS)NRF<18U>}z>(v` zX>2~BdlhnKJNLZk6nJYF5feRSI`I#dIbl`d@l|>pSGL&|{{CSc%cnMnZ9oa4=9)`X zUFF)CPkzS+3&wWQZu}$=~K7WOV?db@W|MAp=ft;d3WGmc&@oFIQD$w zrLaO&%Y*#!x{cBG?$}mNvTZkt*3L}8^D2cllYw;lJC0iVQzaFz^3S1**QRK|?`*?| zmDHA6uRsZ{fj{dF?L1j9g)iQmxufOQXcL=zCNj<oIv#YvVVjyT!GPHkD=ai2`q90&2- z!+PUQ1psJwJ}7L($mjL>ug3U5rD*hf60asHPf)ZOP!bup{_iHb$bmzeUFoW@SlHo5 z;qcqH@)%**6VP*7Nt0%k)5A+<-+;^0vB-#q70SO*6nK~KybDVi#xGB8Y|{n! 
zeVaaAEma6Gp*sLp&#G89v_8P{NmYlCO9-~ON&(K3i1zLueiJc*;5|j*>|8W-;QLn& zwK>R0v<0CrCD5{4%dNR#YNiv2U~fz-aTC~MRak6BC0lJz<4+@`{#0DD;Oso%)u+k= zho{)14VFlm`5c%jyRwQ|A;m1uKV>e~*v{R;BK-~;>D)2CcWWShgah>Pb0FEjVc)z3 ztt0Q)gpsNJ%dOr@J4_|ju_VbI-n#R?9-h$(hJ^{)AQvZ4KEVe&d(yNhAFv*JD+neeH&g0qSq61f#aKaMYmy^ zlvoLL>LVa;ieOTTl0WSaWJTouv_(*^n$FCh;U#Hwu$qAjh{IpO^Jb?i<@R{NrsSZs+inMF0rm1E`UWR7v#WB)(^@_w7bgh?)N2tySXf?UwAl# zRLjUkXk05yH5EAHn7Y@uMNLbyFl}wLIeh6t-cFGyfZjE=5%gyJ2 zGeivz{2?`kX`o0uZI zMkT;@2UhLm`&i~!I5e}7ffDU<@#GLVk--BT#0BG}N&TPiVRY*P8q9DwkZO`(z9HQ(KX@>uB}p$X~eOMTB)4 zAfKWaigK4XGKxet}CgI9D*5#`3BXmNr8bp_p*HVg6+WC$Qg{ z5g)!qK==^iLoI7Q*3L#tEI0Ldax9sO&=cjvUlIa${8ASFBPn(cTgUmppkZe83%uXD6c+cgG$_WfDL5GZ20l^`&rfK^q&0Q!X-tn zH?w2FOP0rn*?oA~^#xdR1kgFN_YgNKI`cSe@m(Ut$gFZwC+7Wb@C`&g7LDcD{$OMdiKXU(>vGTB8Kmc494&|;y66-^^a7)qpotfCHCe#!FmP0E8jDoB?^a|n=8TSlSqS#Ek-?v`KQ|pG zw1pyOi=$e49vQ5`2NGT&qU|r5@Yt|PR$euLTA!w=l z#r1OX+`fd3O%?tgI5xCn%Ofw<_1H^Z*M1HK#q)IyRgdIxYcATf>k7?TkZ)emQR)*> z_b{!a|HRxi8deXBa#@89*z={t3aKj;w)~1+6dxY1pPhXSYji#Flhwx9Vvc9vWOpxC z%VhBQ@!NCzC-({Su!|webs7j$NYa$zuuREF4KPF|?!(GXQ9)gk;Atq40BY|Cqvd%t zDz{ncC=Pu6iT%bs{o!q$Z=P za7timbbMS7uf>F2t)PkI_0{X^q}eH#Zd$S9xyqn;X>97C`*Zc&Ui>+&F#J|{TN~b< zm6ibmFCdzIIZP!Dw5%Ap6@d@d& zZ)(KS7Qu9=hpzLiy#sj{$Q0W+*vP33x>tog>oi1*@|%4~j~Gc{C&`#xLCW~k?-&1j zG`w1YYwwGmy{Jfo`{{d9aiPC41ksQfu`Pc-?hwrE{j}rs9s6dY9DuMd z|79J^cgJ{EKaa`uXzI>pc0^tZE<6i{WNgM-rZT|YDU*uz;R_r5UmX4c5#s{&c!&c; zCSh#MUQDSZlc=I~CcLLe-YJbkkM8@!%bF7O>pmjvk#A6lp3)c1$@VA8gm-A{POZl* z;le|0g)5I(4Ixads@m5^@}cWT$K7qCbWK?&1LF2cI$H7*7kqHX-}ce`A}xM!=s&I9 zawTbVpV^YeyQ3i%b{6y~bRMVSB#6QKLXg`btcgmkZj7!>Yo!Wcos>q_Wm0C?@WKBB(glLxCK}V{ zen=Bq6{e7@?a2J4hi^Q?QA3NR0avC0c>x6zi`_#w`)gOil!H4$^=f55Q0CWJj5#L7 z&qXnvEi8ldtzCJwAPr38lSV-5IPJ|4%pA6K^b#}DaLSAyLG3$u{0ZV)(Q12p zNran@jJw+g${2c=TTq5iDHq5z+#6149uDhS7ov&5A5%c#s^fGYwsXAB!h5w=c3xJ! 
z&O2JX-v<4jB?@=%&orBg%Z+Jq5<|XjnO@JMOpg+hVJ(x$CON7^)r@7u$8G~hoOYQ& z7)POj*0K3&{NI=edF$%{>p){L9BRySnb+`R1;c_cH!4~HXB$97fGPd9eOyu8T(72X zR2Xyt?=k0j6h)$=V0qR+|&G^2-`8*cm@i50j5{Mqvu!SMER} z7%^DUVLjO1NJ-v)%=(KM{3{;PouCtuBmOR=o0T zb+L;tw)1|}$4OJrHck-Z5;jS9WY~uI_DmNO{rY;v(H4h72hFx2@Ukc&!*~NRCu8Kl zy#&%K{>Dak5`La_qG@gu+-v;^FhN;@_}R2_1D}jT!vE-?lcX;u7N3p~i$m1rpu^4X zdGTh?!%L}Q=pBq)6rxbF$0|dZPD~D-Tf;`$tul*iK%2>4Mnxe($0YQt|B)amk((F! zP(*YSk}wT_2k;>!h+M`W_#=c$2&8Zo}3(Vl1=rcJ#u5 zH}h>Fj-UAgOdtj_81Kyw1f>kaQhT*3sjdO(q%P$N?wAl_c(@kE-z-#x2yQW`*nbp= zxl8>?Rr}@+x30YPh>m}&L%dE`rREJKZ9yjXnJB(eN!O=Lia$N1%A&oJc5>N2tt0f+ zvpGaZm9dBsHE{AG{qjI(tjJ4dXP!nHb9`LDTBP zX`ykFs5=cOYi0ub?2Z&JEmTmrr3)5c^*y>!zRZAQ2byd-=od$hWq3G!A}3;CX^lA8 zxiE9>=03L!UzO4}+BQ^NEq$S*=0-&I)e^as{cQziVzL<$jfm2; zme@R*%O~p}kcm@9HD>n-YtcOcSl~=Ohau_t`x*02R$ocfe>KXNF9{z1>~g(3GyF2& zCd-xl-0^a^V$A83<-sXwXau>m?6<9#YD(Ra$ngc6q^v1UiEk>u!ToQLVYJKI4<+kz z+8_2}oW&f=l^6+mU)D32jK#47dS-fO-~-8P7TcAha(ZW~*R{XD?2lsWTc0ltQkf1F z)Sog>Vby-amXbM{?*D9;6T|k~PJpOV3E=-&l&jW@$rBevwKz0O6_3$F|ST0G9k=pUZ(G$J)yW-INnDLB%WRprs$Xw=Z#wBc|4b_zESr!bKPJQCNvEvUCt3GC27h ztaajO<@f7zL|7`=w$8(Jc~1xP8K%od07^^m`4a(;)&nNNO(^e|_&(}UZ4UEv|2G|& z8l-`35@^DfmAss$ugdg60Qz!E^9l*$o;d}=pI&IfEXh38@-IB#(djA^{R!24zY2i zc}5}leuylnQBF;Qnt)Shn~0o8O%Yf zN|9EA@~!pmFs8u|)kpOlgmx$_IF{$6ukD5{p=i8r{)$dj=8;^{e2%gP{ zdeALk1Eno@S$VfDccNshm@?fI$xAH8q^T16n}fc(Q`ei46iG!X4uiwx^U*ZsAFyf_ znY8^(fbCJy?U1Id%F_B`G=(&HGBoO*2fiFmQd0|xy;`Geyj7MdELkXv(a6JCqu4Ki z+$1#Ut;+ptdCM$QkG>veSWZlO*-EI^J*rGk(Kxs4=DJX*zd z31BTG9c|=rG#U#WZ0@_HTZ-&%)a_HC3{2lP!V)L4|P}Yqv|Prd+0B2T>L9 z4S#JmVF-Z5SWikK+a$Y#cF|(^nqc~$DSx$VmSPX8(di?Nt+znsnpT<%9(ubyxyF zaNUZo*Xt_%1(}wRxi+Z{+mZ?yt^E$$S^dMGUPiPDUy<71>^`Y^tEbATo!6|2@mF57h5+` z>!kj*Qg$=iH%sSj`rgDPun1ttdbxEeE^irVb=320~#EHlQMJnt`uRU?_Nj@995?6>< zI$;HA(LccQpem?U_zc{<7H|Z%x;toLkFmxWpy`TK#X$ZYmP20r-@zU8l!~3?@aangt zFT(1~{d7p$>>*y=1GNe-+doephh-`CmHRTG*%!xrB3E`HxE<}#QnOe&C!O{s0klS4 zE6lJ<7n$@D)1NaVINWw784?N!?rbm?JCB?3w@%GSQY(2JAZYV6W*g3g!|9cZ)S}Lg 
zU2MtYAbXPcT2{ixJ^n#i!}=$yHEMHdU(O#L`>_)T?`U19<5p%~YM4deKB`J@NB8J{ z95!mYNjGwcH5B)#u5lb!g$xjCz%L?y=9M!g1>OVcfjnC%YTHGfkYIXtqNWWinqZE` zV)GA8K6>u%qN5ezVN1wOw0aKfup#@fHl{HR(9gy#a5!gjgOGbFzvQ3|Z+6dr@jVKq z%JT0J?K$#54H6qXNnaMzCxW=%_$;35o_`TtVEQQ}zb$Vrb61j;?CuUTYqH2TtQD;5 zx*RViXsp`n=Jqz8%5RKpC{HJkcHUEVjq>!$+fkOd$rulAD0pw9rm$_>oB1#A{F3e& z;GfhT8Pf(NP85aG@AUcD6ZRXrUasQ+mRg_tV-(X&*Q*x{(Fi%>CkWFPT?V<4zvp_2 zBdi}J{9{HevK`}T+zoJxwgY${^43DrQ( zKGW&gZ*Ur>r{_sP$|iB{&LZJVuN)4~hS;hFmyJ$5M((hd&v5~CB4d3NhF&0(vFAV` z3Se<2oJXgsSHTS4zm!~iQY7;aIHhB5?uHe7r=MY7k#M4#;LTAX-!2K>exnL(hmj0f zIqfl^VB$kQI+c3MzTxK}naG&oK%kx}>7!CR7;!u;hSdb9>TAz5=^k2cbbC5OLj**V z*x1%k%8+75kIBGXaOJ%YTzL|rE}XcNt=A3IUiPiAS$_U0b3$wOW<^zGNM6I_ENe#7 z7cW^;8MFea;&JjKwdwqTGvQ{yq#dcFkwq0OLY%=Ny`0Q$1)}i_@1`UMnEnw8wyVoF zas4sT<8c*;bk8HcdiPxP#zU1m!7Jh^tnB!dTk_zI^IBUmW~3gVZ`RY72>8D7*@EcI z=`v$@yL3@4f$hOHuE>pyXZNTV*L|ZRC93ce%nZr7kO)BI{bm~KDaC#)qL~x%fO`b<- z>4doRONjqr9s&KJ53$##?<0bQcBJ+y=_AVtQKvw)^9m7?$_*fr@Ru5Z-`T5K&1)W$ya3qyQi01p)NH%ZI=p~QK@nMkQSl0778WbzI!g1R>FOeF*LUO{rOQ1HqiQFi zn5`RMMv3NfcmpZ|1yMuoq9)eIW|@}qmWzNSLW7+Lnem$WMe&^~v-ZsQ;$RX@65xNN zP?t8e=o^7OR@-`z<3+q_(N2`MXdaLrE{i)RVhhk#!?kamI&GMKEge$L9HVR7{Hhzi zVr=C^uUbf8`cV|w&#KIT^Ba7ql$#A1i$~dY!?;-Q4RAoI2 zOo(PXy63VF9Ty4Hl|CIj^_WUGVmN8%bALt&rM{7y%)wZnN8W&=nH5Qysn~$E2M!ia zY+sLl3XOTX#;LJsm~fiOo~v0~a1YrfaQDnkb^((K9ov%B+<3KHV&IODxB@vj9)wUN z7u~o3%Hp!_>I^D40kSLZOBdc$rt%{GaPGZ`9ETI@Ugx&p$E>rQB&r&i`-3C4^>i#5 zzdwKKPp^|Nxy@k=nt#H0M=y&SZtG%(7L&!&^$=PjthpzzMUc1(O}=n$`hVE|+_MWE zyF$-oqnWi}#E{DE7%~VQ0MIOrXO2&X+zA=uZjEEUHN~gYEXErR@Q3y({gBm|u8A5Q z&=0sE5Grll5nNF=QoON@!K!`0yk$uHha;;4uSm0_*G;5(VO9@Qn z?)`}41qe)U*8*>-A1uMZP4n20M7H{xp*>R7Jc1AR*gR6Tbs-7^p!YyhxJ{S`6{bC_ zq`b8TV+9InG^kO;5M#*&izhbM%|Jd|WKZzMp4B)fp%K!kQ9@i2yU=mT(eN;x=xaBf zwG&L&z=+0qfahuD1;C{Wq1LhQBXZ2Hx!kShS6$!AM~Tgkr{9y}68AO@QBP_`*b@bp z?n*AWQ$ete%E}L3RRM_VVr0m8xLjy5nlM<``{Tx9-Pvh@UuyR=tdTrlwT;0fXB}R7 z9RnPU+bHQ`(JG@m5B3O=FQhlmf%{caydU3$zQ&u9Lj;QWA4-jxHelqc1mF~?${85jFWP2jX%E}4MIXv1(o;mcBi4# 
z4*6&N3N?6Kkda-)NLUTBEanXmq8=!tV&HYfc%!kiPJqx?qAs> z)z|`xqu4;4R90b;VH#wb;Kh!zgk|LK7P7j7!X)BONCU1Z_}1lh2*k2~cZB!MxpOZ! z+?4M8U0f1cLkfcf4_#BzVa8zo;e{6l`>0CiGD*uO%dZw$SwWwEm-*Jt2JmH^N^xQQ zGRw#^YJ835kh<3?aPW+ejZ$M`mJZV*(SsSj(9*apnLnPgG#XQFpX}p449Q%p;ekx$ z%}pRinKaFYKJG6JM@?79jrLz>aq-orS)1E$0IPVa@X0$0Cp`J+o3}G$mXWK+53mI; z-mgF_K}O|1n_Y0Mlu(J;iQ|G6*+$JYp|zjJB1@VUk)=4I$JCaFh{bEaAtq( zSpvYP&7Q)s@a(4e)^?hDRk9?$x(DK=wrB>8(&{TMY0_yseC++;>(sSHoPATllb5gS;`27#D6XP@r?%gaNanSai(ng&Kyh} zk9~dv$^o#M5iXJUSp3C$IHY?;(k=BPc z?9d`>`|YXI(>*n&{@@t?fONai%AgSAt1~T{7hiL#JLf@%O^(s;dniI+Rd}EA-J6y^ z^2ey|IK3hGxlYIEAbs13^2WHs`|Uq&2-be4Oo;Q#a7=2%z9cs5%_jJnoY@-g#AkMzq3wDW=smK#<&{U=*LL4bUpMGI-o4hpYQVf$b9mU|}xr*BwmSysv9Rs?lnjQxd^R@qI(p&??ICR>upDF3;NI`)rI zi$P}|n$%PEB`1pqalwA~V&HtyAm~HO;$YHuq47*>po2|}6cofz{~D+!r=;CFkhvB_ zwT?!}5!9=7--->Af^DBMo=;ZDvb@qWGr$RsdNh4!2wdJH+qJX(QTRT1SF$@xqWI6% zLQe@W@EHrb_PgF+=>Lngw~VT)d&7Pe5CH+{2I){br8}fUy1P>vBqXG}8)*flyStH) z?vn1#GdIukf6n`kaXy}J%8>2ed#yF+J+J$^ez&#e1DC!tz2M&T7WyYC>G0~suQ7DR z0`pd!HcbiQJUwIu@9Qy1ztq=8bG-Lzcc*Bb^!Iz;9`d@;ohrNYQgM{AV0$3@gz153 zliqml-`FXoNS2Ap4l|KxH{$F{j0wvR>gNlSqGid6nuU0gH|iw$FgzNHtF|o;>g7Rg zH(nDq>O0A{6FO9}<*{vki5_tT?2m!3wToDx*`UB#GADWk5)9RbV^ocB`C7KUUK@p{ z9SgFkMTL}yI^~u53PHM-p52w8gkK2x&<#oQf=qLrL$xE@Ehoa0Gkf9A`b($mTFx`W zitu7CcKxa*g;Oz|r<(>6pjGk}D7X%&Hs)uq8NIlO6DK{CqOUV@9ZooC@=LnSjg_4R znKx;g^MB%$t;IsE*?kq?TV`f(MfB>gJzyJBl%HPfw*cg>AhX>y{<&)2MQ#ZD6;z*d zzAS*QP?^L~-F~U@Ns9;6(rnwusC$}Qrnui8J3ox|YIap#{EB|bHxN$hH=h-RDNqfJFbJAt zU^;gql(q57_20}xEd>?(xdYp>F08W4?<`z@Y_bC+_t$J67p<#jx|y6+bw5`bO2t*^ zwLx*;AAoq7b!v1padcQV$NG-ZfGK?sD06F*SInUFfviAcl z)FsxXW0`)WcoEv$fQHboB*%+M#|G_rhHC z#ryNu0Or5~@C&SaJZQIwX&^s3+@cg33*k-7p1{$)6X~;epd(eSCF=mLNu>cSrlxd+ zTyiExZ_b`UdsC3@#Owrg-%8Vo{j~`ePllUoOYfbd2%kr2uNdL;04dWP>(K0?L$H#!B_0^4OMVi5 zp<3KaS2M5ple!aD09)G;Rb3z0q{OeOlRcLcKJBXLxEVVuR7E`6+>=;$qC4Q4!nYEL2Y0 za(4pK&{Ep(oqsW~JRt#E$x&*-si}6m@ihHV%Ou&DGMcwy&;jNqhVrG5hR%|mn(#N) zWne`|;$8%q=J1zm_nUj#4Y|kMEcU}Ggf#ovx{mftwAvTj*3=t=3kWN25%FsUQJ+S5 
zIj*L)swCD+5+KQx5MxZ$^5CPEziRECbxrDcLjwcgl1jkbi298GRBv%t+pY1J&V)M) zY8ALj{T`0Y(0C+N&&Frcw%JZtY-OnVX^}cX3CH6-Hj7kst!JnsJF*8iU-iw8%EcI# z2b=yn5a7N}{-kKR_iJELwvP{Zs;Aqiw;od3HYkWff0x#>nJ0WpenfuhKyfPg=`v19 zw(Vwc4=V#Tc~??u-|5*>!-T}c&J$h(vZFyRxgkfm0)dlbqdiA!W-Z>mYc8DNSy)}h%lG4GxgHX1VrFhX)#JZA>R7uY zSZ@L-kxFM}5)hF`R{caCK0MRsRv&LPz3{dQ1e+UBTJUBrly5-RU58oO&L^?V$Q+-M zVK_lgyEb4MkX!8LSaFlVa+~v%%wKF^_ee8w?;%&Z_yWn#hjw|0nWj4|QJ@u$n^q;g z6JHswCKMQ%u-EYTJU;~7w{87RBYd}NB;9}0$`?O}XpnrjRGXSpQ6-I+w$T%1U}8bT zfw^KCi!Npb7lc^vKD3NlrGcC{SU?x30A;w2352YmgOYS|zVtTF?euxbDWSW$ zaQ+7vgr^69!8;Y1LA(DS@TsC(3BsST9uB4V}+bfnXemsb#$Ka0E})hfMR5%DHOkuLZvIenHCRcSGd=- z1d0@ok=Xx%AZAIIx4Np2!6lUDgc>jLo||m-+Kk=m&SjDP_On`jZ^A^$MLfBxDfMY5 z33=K{x{ys$dvJWWHxC0|VRO+Mq4~&itoq`vsX{c&heWv=0$J0b{TWmeeJt9Gk= zdT;xGbI$E5svuiZBiy=1%*to>Sc{o6hJhVW4$9x{%MSx4!)y^bJC64`U={^(G_I_C zeZzM?p|t%g;>;PV_ER?j<_-*MNENhLy1YRLR)N2D<8im0v(M#t#szCk?+a`#Jt`rH z#xt@Cs7gf)Kzll{4@**N$1u3hYaG7uoSR!zLCf^I*+y0x#bE;B+E6hrp$QsOkbUt= zd$Tjhio2h_p2oxmVi{w|il0u3nmcW}Ut!U@4u8t1xIJh{WM=sX&LP<9+eMM)8!gt_ zHYvRL;5$8H+juNSb@aaW6L6kr7=}N+1i(kN>yx|#fQ8ji1{>o5pSAabe)6`>*Cv&8 z6tJS1ATa&a)icbU@L_-5)@nATED*B+kQcHI7O%QFNNxXQdnXRiRty3P!$oZ79XT2| zA%uw_&__>5!3VZnsxRCgaamx;rFtKu)%cBv1qe4*K)cIDVE;swN>??ZXzy7Y+ogLF zlPKbQ%>nKlF+zx-l(RTv^_WD$h;pAwv`5Wo>IFSuKJQHC4j4^^Q`M~EBUq|%wHviI z!({jX7?H;LN$cAp2*AXLOWE8$!2vk z8tKK_i{GZ}v0qiW?_dJo{MyTUXga74?16nSukY@Agoa_kvATw(@mG2O+pyE<4Ekme zate)$qHhMS-Ur?u=#QQ^H4-$zP=br?K!Ev9P4f8k_&_eRpD+Yu;=8pJHqvstaA@|! 
zY83g=#4dlYdg1qy$Q2+ugmI6um*>)BYvpDh4cZ3NjPrC18QsP=LPek)xqu>rAL#1) zK_an~Q^3)TOp4og4~e?pm3YV0A$E8X%5nd<)iILk^^X)9?-6c_nap7k`uZN#n3UX3 zz0qVXnesmad?z=E-f@*QOqU_((Q<#0gVX>a_~bkGNxQ+jz;`M%XpO-kIWfCeOF7$) z&&{^p>@(;8teb#miQ(U%kt?b=Q!`Ab=zu(YV^2@(iUf#=BSx2?& z7Dt=3OXWK}tfVcJI@~oHd5#M`Qli_vG!_zH*_>LdG9*b1{d>LiiMH)o>qZHQjp0qaq-O6i7 zpNZlKC(>wsd~d;Ix3Zk9jLn2}glm@;iRJJ9uPRd=m)u3_y5@;ziGY-Vt)3k*0y*cB zLkWL*=8*UGx3rKduGx=b@^7z@Dnr!OTn~pWTXBQ#(>ypc_klY<;cYSUM+;PEvzp@h z-V*@bSTfE042SprJgnuRG)GG(2zc#7j;3#>cf#)0c3^0aRAQBGN^T9W=il3LyA2gZq)mp z!hyWRW+TOz9fPZlG5nI%8}!*aqmP7!WN&oX?N~O@ddPF)B(w5L1M!H8bX3kz;G_cs z|Dx=N=ENwZz{h6B-m{Pn2*2hP@narS98CIX94uKS2rp^)MY()5sjG5-p2u49$6cOe zpUrbSX+JT}>l#eou6E8>1+v#-EOF+CAyoUmRVrErjl4!;wwi@4LHq>0QePIKzn|`$ zJ@ht7m;38yU25d>Y!J6*o`{+F2Dhq_GFF`~HE-a~w@_^1Dy>@G1o|c}psFXlU>i&u zlXngCP`O?;F-IItQl@8_>6Q!gu?M-KbTnHERG|%t&cfgf>jJ@nn9&91N4iiN39X+x z9ky{|0|)>b@R1}VpB#x(Kxcm5HGN+3S4_N|O_cP0%ESWgEVYpZ!?ms)C!MVItC!!r z#nFo^dOG!$v)j}hJgYK32GJg zEG4Qhqu(9Sa>Z+&ovS$21qxx!_%(K|S;fHou)v_spDZ?;rf61xTYY_k^~>7%kb0<4 zDZ(X*U*(Wc`r&}^hcu)dgTUo}JENs|%_n{7EDBJWbJoGS(_RXZI!0AjotPvVet$Ss zsQR-ycsInv?TYEPE;&^_Le!Zb++Uz;2UcHkLINUwbm!-mCxSm>>x>d?86DGo78+>Z zhPoQj%8xszH?t9(26y`KaHG)J(r zu4U{hsj@cYW=%#s9GP_32aedEPEAaoMGoa^j)&@_{dLi@T#=M_!tS??6CP1v;P$jn zMq>}Q;cC1p<0uc?-Hy{m;Y7Y z`UX0^p;_PE8s0|IzH?|pvTN{l?XkmOTGBsOOEj+gUs?rtx5WH#XqbpFaCWb{WepRXr;3dUjiWMSK&hhS|ZV zOMCdsTM7w7-(*@=r)V=x(UlpUF=?FT#_k;(RL;$WB&tSCGtS2a!kYENgDXHs05z!T zlkUflQfb;UCQG}8V~jpcJt^mjQ#jLszdK=B^;)yxPB*f=WE+Fz{TgrksYAoq+~kr757~P?+G$JR;}}!> zxCe!Da7n`{Ge7r0SHHlfdDZ z8rZ!#7F`VmxQ1f?W*5=?O;Vp=uq88uwtB!&(LbJ8*3qvQB^l&9ama9Vl>Fd%vt5vS z^}q7X!KLyM9I$@bCP=IU){ql!Nt~T*FrJ$}AW^dCyPb8FQJ~SPmHJd{95fu#r?o)7 z3MUwoKR1U8-fA-suu#w|hX5cos-X92bFj_RqyYfD&}=Xubg<7?#VD;=z03r7sBaTR z&-!u%pMCCMrRS^uf?`bS)UZ`I*Q@3N{Zz22AaFTLUjbBBRal#Yv&VzBhRINWXKq$*p(adZFvQbdFeN(Hs+X+KedNBC>{SHt^v6$ja zavN3x@!x(`&1F_zNvN+CjUu7G|C{Ok0MJ+vXjrfdRxOsn&3nDLEuT`F85>k)poH{< z_%ymXeLDr}Piz(n=Ef5EVkKZKU>vZE@xl@1pnQSn|K)0n$pR7CbstrxbafSu9=IYu 
zUa!+@x93Bsm41z=ci@C+nfd%37U^?PxzTNNzo0%af_gdA9qBa!EBnL5wV;7zSX}Jo zo>~R9B6I!wygOT5B3GZb`D+)_JaPmWw^uz0!cA5h6GhgznUGZva`r-m@&zRuWh3FH zq^7ZCIb0j=&$xwZ%Li$HUC-mLT6AkNsU3&y+_NUDCE?^vTpP-{U+b89UYCiZ4}bt` z;!*Hnyv1c&|7CR8`UBCYL8gaE{ywch{g#M=^r^&glxnr%!7m@aPBm__fs_0&>qL?= z;1Hri>go#gZ+qvzNf}WdCN)ocysSnmb*nch5k+*Dg$G}`P8N2RGGOq2P!2fRi{TrxDOiv%v4deEFXRkxa!j^LQC+|Tfo^x zeQF3N;JdXShR#+vE4mJI{3o|HQy_?#zRtQn{rmMnhO%@B1)U3lqlIV=N>8NDHRF-D z2MzHV&m#V6m%s9B`sayP0G77^L#*7n_|@;nJ1JSPQ$n66G?+kx`G6@nrAYJ z;`lyh36sV%G}xcMHixPQh^8=L>mf2IEwg3)Q=QF%H|7a&ZbhM?R{%s?b5*;#JfcSh{d_4q`@-qaJK1oTM(EQSSCc@IScn9c`t>-hq9oPwGv zg?F_7i)$roesQ}ATnw4OHPp~JJmko@v7A%Cr`WB)=amHhsss{^j!%zZy)W(D5{%nH1 zHOdw_CyDLiGlHA(g9>|0Y9PIu=iyc&5SA?w zM$F1ko1nM&0BKL;7S)YGCw%<(Ofe}c9E=$c><}DFLEmhvKuUT=d?a74Y+NJ{;tF&m}f*9tygU3_uvqB6HG+ffqW*-2$G+IV3+|J$KW^;1#`+@^iToCw8OqrF#Dd#t8kcEcVbUsvjN0YgQ_j z2k=PJ8;F~eiF2GieO3@{zQp`v8PMjbN+IaC_KO<($?r&Jc}CEHT}anY%KLZxy_|O6 zMb19!wZm#K)vVW?J%A-F0m6crk0j7UZ&ck zj&lbq!)wRcBKpq(ymR*Z&KLSQX@zG1U0H_7BUHYWMzaqBxFpVo_YlIK&p9MEB;99n zP)B@{MbACiCCGqK7MuY+!7;4|ZDp}GofE;;-{-KpANfNOf2C=I&XxpM&-h9t?_3vr z^7%zv$17&!EG3CO>aP=J`Qs~gJ=AlKT&^KJxL%!aJ*su9}ELsT`3E`U0b~i2yhZxxZ6D)%cLHc@XaIWHnn~3UuHj(Zexp z01D77LUK9gOf%#f!nW)Vn=a{RcApgx&CnEiq5iVVjTkeV zGixUAZnM5_Am75Erh8q%gG!T=J7BonN^=%IMi)JYXY*_2TWD0YMH zQoLkIedOmoLrHm5vTIOlVy{rg{zh5fM{|5=C%NWs?Q4N7No#P3AuZ`h78=p3qHMW? 
z$#2+>s)hr1T5>wGHW6=n_UoT->r=7)O8n) zjlz{a|K~2+5ii?E>8A7I(+x@&clWd{jedsl3uE_GOy3P*rF%(c-}i5-Beuti>Wj)@ zURL}_jc4rgsa8L)(w*9Y+O_^n#^+Nq{>0oy^Gy~4O&aS~QGBzpv$xi-(OF!ZceM`< z>WqKpcE-)f_xN+av;v}0PkHZR@!K5ZL6PoC3hT(J_+#v$)N03wILZobdti*|JvvhQ(BH?xV zMkY3{LhRP8wquDRD72#Je|kS)peb$Q{nB-E7ReK!s!n#Id{g>$m(sU;-Y#8Q4IZm> zzkGn+iRV5(Pa}*wp# z6vU9Qx;kuhL}B5h&nZCF`FmD?8X~zr&ko&qI_KGZJk- z{a*Pcis_&Fk_u|UM44*AV2uwclE3s^R7`Wd1`s>3wM#Ihu7y_upgY$bcC>4C>sXiB z^XWdo4ld`x__`;kPRs45{Xp6>3)~4Vq38&BPdP~zZ@wU5I}u?HLI(TArrK+)UqdPC zmfWx7P=gtNJ_vQ_60L28sN)n>WS(-qd}WW&Bq(s*Hr`l29jl$PqH;@9kVeY;gZmY` zQbmAT#;bsM>F!^eeZQRpJqk7z;&_eC+jkae)DvBm-WD_PY*x0t(rdz{DW$Y^qZv=< z!lX!T7*(py1lo7Ttc=)Mh#36>)r)qj`Pq)`CMk~-m8bo1J(^@{r_b{bEIbErm% zW!?=Zv%E}XioHxVwAElzTnF`g~Qz~oYzQ`dQwFzCHzoA1Q@ypPyfCFTWE zrR=U)@*3-|IY~nD3o$qzB((#LmDBUdn&x=Nb3lNVO=Mz~ME+&_`D@aK=|5VR2)#$0 z#5JVrBsqzZfVKfw@o`Pz?XMUn*13idsMc;>#kjw{UFj)O<~H;D8)IR?JB_kN$WAoB zg~}(x(zz(|Fm3R>0ce5+jrzIUCwPp*Dpu^)QSGG9#$O4YiHnn@uK{R!QNs(e@PkF> zBg9>H{yWHgt00X;cKipd`)TcF?(Xo{Da`o7j4U=6oyIbwU*kO#=|vHH%meRmv)k;7 zrMo}vVJ<4B=R55hh*QrK;}2r2=e{~KAN<;-E@+`26utt1#!0~rO@AtqAq$_eDaY=4 zu|xCq)Q2ldf!UiW;2myGY`BHX86Iye_M!LVt<)g@{W!m9(%)+c5_A>!*^ZQ4UGvA< z%=%8U<(MDHj@uZx0b8y0KL^%DOTxn%cduVEB2u#~a>=lr3$rbOQ|4}a(XErmjwbF? 
zX|5-Rf)jKP{TW6j z)}`(boCKLY*%(E}kL^;nwux>v&dK0LDN${%JnXnVp~dJfe7H1HW-L;M7bgFtS5_eH zYMDIYualIcCC5y!*c*&X%a6+j1MzO^lGMwMbKXVgYtPIb+wUFYx>luE7iZ_*Re{Bw zUdp7DU1{KXMluIGHu{IyPc=>&HT2bN^GWzNGK?Ecn+5Ue3B+=4*Xj^M54g6iTi`eb zX^+m5gZ8Lk2hXsh`K;o!?v89SlJRUFX0m>s<*LT%xOBN-6Pim(wN>kac>+g_zRQn^ zNwBc?8*~680$4q3n z^ie{0y~LkL%BIBfHxn(E$$|OZ@V4X$5$mkF*VD((32J=?hGDGW#IS4bVf>9o{NPcJ z`?QFWLxJGkB08Vm5Fhpy%6{Q{H#a18w`2ry)YDd9krU#o^d8fYE zntUwZxU_t%5n8$@DS8Gh50l;_u*>C<|GsW;D{n+%@)_gisfj0<<+jn)o)8w*K+P)s$#>JVnS zjIg%4=>Oi_Z$d~P@}Q)z;eWsO1V)c6d9zF5bZ8E-!GIWHzkA+PlgN`3-hy7inLh<0lMIxqy6PUr@~v=10xI#GFZH#q`=PB<;Q0{_Ld;osr(AG{G{ z+@Kdm7}ovvz$c2Xdb*u*KZf&@=>D*-0*&(AVji-Ay~FGuRo(j`TQ4ODyyrcN6=XIC z^QrCsJ}4s@WCoiM>D5+gY@00a`NyzY zVC-oI-krMTFRx@F-knv@L#hJ+?cZ{|1RWXi#ui z_Q4LOR)gTh=l!ZFEVaYhHRQOk)GlI&00=V{S?m^|i+r{T_1g+Y})hsx*zo|n7CY| zc=|=A0fQau#Q6i6%dHFa*ORjxa3-^Q{VPNToR20sD2*11M1fAJ3t7eLE*0&ZxQ9POhbC6DK(<^00#27 zRQ*vw$BKFY{tKC0@gv&x=t1po!cjGL)VqoZe;J-0C5Vzj1Oxlc+i@8>v<>h0By%igYyT& zOZF$T4y5n>Ej08RgxEXamgM`B8!+2X3EuU{k$^6~%~4sMMRczwoj#8$@{igW8Gnd2 zwK=dqG-md%-vRR935WiFS#<6?srW|II6^isx`~Fn24m>xmq-umcj+&jJ+01F+XntG zmkvvX&H+McjZ!(JKUgt)=HyNIw@Zo#D_h^*a*S4#<&hA1z5M9^^yz2=@zyEN>6#+! 
zFn}$l#Xb1{zUw=^WZSB&@R)M!nha*Dr9(NAOIt9u0O~)6;Reurjk~bpQwma@rfVnf z&LG1FvUGxRSKLa0Y_+oW=9uhIhJ7X%lQIA_2RXHdqFK32ss3)O{BMKW7rL`Bnp52d zgALsJKDIsA>5j>>Zf$(8`Q?92L+f!PqyN?rUN~B;VRgqz8ve&QMW*4u_4<}^?djGY z0U1SWl^02w5v-?Rpb!5R zyXp16*O>);@n?ryXv4^n$M7CONe#KV^tb1Vko^gyhcZfc{*Rd^EKty3dd4Q7k<))S ziIuRP&&?&Nka;9K)|DtNdBuM&PQ&0^e0a*q<-jr?F$fZ;krHL;{Uw_ z4q4RclHr|Qh2_6LG7JTiV5a$I_tgHq6T!c@J)Dd5*)xX^;u-U-_`IG8F*{%E-`pgL6VOl z66AFtbF?TD)K=9;+$Vw5#Qq{+ID*s*%yWQDi}D_e%{T*V$t{4rukLL*UNAqcMKll= zt%6`9n2rJZ|2%huFF+|fGUvWq0vQ1zO6Nj*;ti(UaDvU~y;hVJ9^nlL20DN-2VTHK z8F~fAiS_+!9a%kr%x9?qfij3<99xeLcMNNKV#15N2Tp7!kn- zyb9)@evK6JKVFYOCZR!eTGQreX`9Lk-QP0x=SonCF1KK`3saTv*ttJd7&&HyQ4Z*i z3$k(|27wxSi669?bZ=*;AX6XWdVv~#3f5%KhpRP6Rxw(LbGzo#9t+sAy?KB)0~@pi zKryhC!-}lqeSZ#VdO*fifwqK4cf(_-?%gWnuYfvo=hC#R&E}*H_FBl0HQtJh9lKjF zJ7)Lu9QaQ01nAUCmw};J+q&wH^Q<{l4VcOj1XQCR(Ix?yD$~FWJ8!@`3w!~1E^A}s zqcUH%!H_m?CfHiY*dPc?Is|2sS{;f&?!tZJo$gF^D?AyS=GtfGH+Rb~@q<4ri4!_u zq)t-t9S3)pBsblGDR7ePf0h^Vms;>AR7~JGnJXQv9KG z?wSa4l0S(h`f1@NW{aR(wZVVa!5-|@4>NBj#Ju@@HhO#5vh>yN zq>t(r3#;p7Jzjna&_h*g`rj$&BFkNUc}1`HhO|mN*4@gpP39*AFl6UfBDe--2CyNJ zu;Xrc7@?Mu*~+bhmYy2M1Ldoq%{M!vp^#Bu;DT^5kJ~45eP4|MFf~ zZUEAj>ra0{=nat1S~hG3#7QkumL24DF;Q?YV~0;HK6BCTm(sgTM@PkpHsgWRwZh^? 
z32L==Bo-4y)T1L9??Tye)cQun2s&zPb>JcF$TOi^cmiT=<9KCK$_G5HxcmfNpziXS zS4tW=OBHm2Cv3QQ2{?jkpmZUCl?j+3GxfkKb5cKurc!HH=zN1Z@UXd$T2Lo%A6!U#n$YU9 zzQhwML{~1N@pn4MA&}-7I%UQ%3~V}N->65N(-;0;SU@`8?xg; zop#+0v%EZ9UiRj0868{f=nViX?O6Kd{^!SFCk@te-ZcI5$`VBiE!UL*P3tOU+%NS; z<{kqc2iP^92Hhdu+!X-bSv~YVSmIZqLXfmpXhtlxc!Jl;+bYN=zK+&zhB$Dudv4+Y zVR9q-i3-tef8}I!zuQOZFctnnLld(d{5cpgl-Q7RPz`HH7>Bk4BW`RKj8p8@3YEGE zC+T}t34RflghWC5!?#O568t@!dB6B%ETnvQ8DY`x3EHI{{KX^Kn4Hl`y7e1SQv@bQ z&l4w`1o5xTw}!{71G^_UqVR1%^(UXfGC zvNxC{w9T+BcsX+qM<5Zv0~u`%Z8agm z1<$Lt_JO>WoV%klv3ov*@$AOh)+5#H`iM1#gP@*GS`SL`OLNal8e>#uNz4W4bu{MZ z_PJ3iWV|@)6Ur<0Q6!MWQWNrZAnhtVUdkU?v!k($&lfWpwK|YSx3N!x>UE5Hg>Mq= z*g$PQ*7B~NOu}!<33m#-@V(ub!S$ihMj$w(CnqikwIn|mF1EthEtwpD{MF>~IWd{&NE|~<^@6kW zT@q@iAo)fv)Y~9bA#;ldTBPmbdp;ofoDcF;n3{|XW+ZWDzcuA!@MNwZ1n;UqZM5=R*1L1Q~)-K?7x)aCJ}O zAw{f!>|_&WmYysgp6%RNPm*!y=eq^+eHU}K{JGfSmCpF@{59Uy;>309(FNcX!=~BF zQH#SHk&mZ3iN&8YDW=q5>5}!CG{(hc7Gl$rAdqw!VC==x%&!YF%AdIgEY+aJmkb55 z-WMBJZ~RuZR{BMe&kFBEPM;L}V3G6z)5Of8ElcgesIzk`eKTHX;yFgICORy)!%k5_ z^BUn{YmPvctRMN^@Znv=TSxRI6lJrLK561-aa}eW{qYjb$tHxA{hGV#1rP(SA%- z5#Zi`;b;P05YHE$x%a4dAqSdB4VxAsq@oEP5(%!-@J60NXyeeVYva_+xXR%s&&I`k z;wPqrF|9ZHEpi77k&H3hgm6$PBrNJ`XTI_aXCN? zCe#(BQl@mw zKk+8DK@p&Iy;B;~FzSmL?fD}sML~(r@WOjnPXBxEn?4XeWQti$&?hmY%f=P0G*Nd) z6#s-~&s^l5qrc1Hs@Ml}XTUO@qx>tQBO#EwpIAWLlIx?QhK?N1u}B{ZqNNJ0a(lD_ z^NEswd>}kygS;dBpuQu8PJD~X;94X<9}!)U5vr6Xjf|&x%(p0gTT#3%Sn3!oTm_}x zi5R8M9^G_f!u%ou==6ZG)u6YjN6}dyS5PUc5#S_>xeNlI2#_axN<2L;lYnQb2m zF&^*?**ZT^zj;0Jn@$|Q=@An4npD~C-TCw5_}0mwJYYtLeQzWlMpSBZd_A622WJPhMsRnQ#|NEvQXAmm|_gZx)kU}Di7()8WE_BQ!nWGz*@W? 
z!MTaElqak7byP31^g9_DK_w?z&~caFrh4`SS}He*YVxZaqJ>LTAdqOiGVq&qN??Fh zxiKd1tPXu!ybY=&?;+MRSi1MxqS4fuHm4P&VKQRuqMovHBy2w+yL8i@1=efKD0Q2y?-YHtB4%; zK5OP5Mfe&V2?bP`2(BG!sZ4QQ@dB%Pk)e=JEB8pagFyd5|0I$ zUR>086)xb4h3#p`@|)QIY7y1vGq8SZ$c8n)>~GX&fw4-ShlbLX<=Hg1N%l6@f>pBm z{Z0M`dSYXLu02iUE7)(Zo;#7*izK{_GFofQJzm)M(xZp{Yqar4h#2Q9WUQoQ`3qupvM_=tX5l)gp$N+^%lva0abw0f+O!!%enhD414UaxRODY3`Cc(Df7H@;%m zNx2Cn{v(EbApZ=1$2tyoC)rGTszpR`&HsdW(Lkl}cldLwZ#YB558YyD!}u6&Da(3$ zTrZ9jq^Iy`&8O<|qVI`|75DHi1{liMT2P(N=1PmEV8}TsgN&fO0_>TTgaXU1%vkb} zC$adglSS8ZrukMZ97#f~MJ6M~f<_d~8Z|s!qQfz>hSBu*T-DQs;=T$jEDh6!h9&df zlys;){FE9bEyH1LFiCUhr->MvpzIfW>-(CJxr6&vZsSfEpf0ake~n*+zpvQQCllhp z572bn(Fj7m*rO9RLfz&r0KtDNW{cN+S9(gL-(?206MwT6FOp`8ev#*LqRelIE!UZ0 zqR9Vh%@gWWmcKn1RER#ad!OJj#j&IKcZc6fCL4)tNeydP zYo`fN4t1CTv0ckhGs3rAtbR^ z&`vkVvx(pQ^gg!6dgDyx72WDC5kPc?AI-9)A0#+sB29M`AN-CvNy_*8?w*j_9M;xL z{WV~bf1!);4h~Llr ze=LA0)q-tA%`Q73@sA%InuaJ69@&Gjw*M(<+OY_~^px_iiZ)E9RGE2fi`A8wWUtvgfRzsw= z=Rf$i(`aE(Eu$n?^4E;#zR?4cp3czNYkg8}vR@3A0Z2vd??@H?T41WjJ;U;N^p{%t zQiL5`02dl=fFBu>ClM8j5()nr&5w2wKi;zmvO+6Gc;VL=+01o=Y?h~V z88!OL-y^vPvp$Q_*=*)&pSuy@>t*NN6E{nS%dZ4g_9Jj*wQST9_ugH?t7?0dqPCh_ zdCTvOmtDj7l-o=Q^!@zyvoRtdkKXfSrKf0keK*4ouV}WIIl)0O$JX3zjDQrUj5_sH zj81*NUclanJNNOAo4g$@*Z0ygNwMq}1gSzIrLH439kyTZ^g}5Sou+8sNfojpt!i}c zS!FNS!tqvO*(DG8&eU2jLpOxnAx`MkGKyEMzmM{pRXyq%jhWdm! zn?M&*)!vKV_6NUH6D!m(2|=^b9b@hmGcE}(%3jKak=4lE9edZqQ>jtnPzcXTD+uO+A>DF14VpR@#4*EG44w~$qtL>1z)HveH^O&KjTh>85rf*ru_JNSXbFTQ-sA6+&} z*7R1O3B6nLz02(x47Ru-MO1ANL+JM^O_O+1E2|s`33u#!nVJH3#Rz*yDA*tG*>t!E zDmFDr2(l-ODu%gUg;ULFwi%*1=SSioWo0K6rUF~5@CH<&FvFzG=$){3iabtx~QNi9j_j~^`mXKEE=;kYWJgPo% z01hV~M}8;!cd=RP(Tx|Ek4CTgVjj{249T-tO~m~WFKvjv+in@Oqht@qg}$^F#41&w zxk19Ms5O(MCSpusdnsRjX47cIw&w~pSHJvc9OV#&JeK;(PtkD

&}-SkQM{wL54O4jz82Rq%eUX%2S?BK<%#s8r+u^pnr_8RiZJ-H-sJGn0CI%YC9XfVpK7p)0;N0 zO>G*snWPX&f-$$vw_cd~=n0=rRY_Cn*PZ%^rLq$E;S>Zeb^68qmd^4yaOWqZlMwR; zC}d#ZaMjE^xIW&Wt(;qoZL#rw)is(>BG;N4{3~nR@Zy_IM*Qe6g>$<|zNi-*A0q^^ zW{vH9zM_~+2x|smGyTDLuOo6&!2HcJH!ZyH$+l}NtUXU=+xShDb#a-!Q!7h_gnCL@ zE%oXi3AyiWBuBfVM3(jNT<7)$*Jn*vAUcXAB<)E)BY5RfT4BA7e_VbAu-pw^wOxK# zgq6u5FIZpju&cF-c~~9ed~Eh8!96KMDG-y4Y{F*{gm%t+ZK{PsW_+>f6h)7Ry@mvD zJAq6O(>J0yR7#&s->mT=Fo8ve7$KBnVJ`xHDpdJHUGFoAqK5G2ftexgw-#bn2(3dq z3w8JNIbzJO=^}>A$+ej8kZR+|8i}>EL)nOD5?Gi_d~k2aO`@uh{fM^1nRY8^1gNnM zpQ)J$(-=6T4pCOz=8B{?3Akw{bMX*SRB;=r?+a9t4H`8XJbXHQt=pij|otctK#-lVa>>H!M?H zx=D?fo`yaFHR+?5))Byo&3c5IDOxA@xo?%jc?QxFxlwX6vV~J7O595m@Eqi2&AijI{~X z^q-=5eSu!N^4aYa?r0oN;UZE)-9Cm}P}qYKD3_MxIYG`osrwQNdmLzvpW~7mHCr@z zw3_)A1hh@a2*lvSB_f}lf%oe<2+9bYa}_2TqRyG`(eB~&q2b-Ds;x6SuGQ?QIB?JB z1ia70y+tG|T*l4OhpYX}gV15f)ca9C&I?S3yq@TNK%bkWu&GxY5epvT`$Btet6H)1 zFhD>`!OK5@qdBX6DSkt12W59Y6O$)Xt#XtkH5XLOi>jspvn8~veVS)2J;_G__gjJK zwaCgW4D83K6*XIu=8@*%D&3(BDzZy$bB+O-Vwf1DV=+D?ilSWAHK6 z-k1;ik57dDk?vnY?|%0K5=S!*dUozG{h{i-Y234%ZsW`GSqD1tVv6ALW{sXlcZ%LI zUTAO|n+gGzuIt!gm*IP8iqdnvd|&gbuM?ZjhOqU~GLYmt=qg>2Kz(TO4(d#M5U(Kg z*Q@B>Vy*8zEIY;NstyW7;hxVYp)F(-y)dj$GGKn{-wKeH`orZuCQ!h5qba{Uc77Ps zUPcb${@k090Nt+0q4~SExVY01sk5l(=P%}edPgF9CvdrUyl#V38e}aBo(uk&SPtys zZ)7(dN;b?4XEWu_q>PN8nsi|pzPU?e0iXI23BXSV!TWI8o#6{9yk<>t~7JdS?JMpr>fd}V^K_ado;qOW8%O8QlJ@ws? 
zuneU497f_blR{hQHSg{tA?IUz zFC&M(C+5t$v-}?!assm0>}Y4l#Chq!O@E1JD0d#QQ=A_nG+D`)7ZO*vYMS!ISxZ#B{tB4DT3MXp#(aIL^6GNorUhAOJ;?H;4LnP^Oh{&-Z zT(bSAF}|I^%C#bvdv@86V2cZYN+-CcsTgs4b&cXuh>9n#VY zNQrbwN_T^FcPsH9)3x^g?c?|8U55)l&ok#7^BLog>;7JiuBBXq``z}Icn#swgdMYS z>Na*VSsc$uJSrvKXy=V&#fQJ{5y^VvWQ(`yeW2?8V1LKuzT49k==aNSP{^&v_wnOZVw3odwU%$Fw)|7ecCV4sH4<%tf6~;86GYyX&qb`AoYLj)~MCCY|aWqT9^lydjuz2Xfw?m`$B#wZ{eu z69Tg#p&l8&m~jHtfP%1YdRnPtL$Fp*!ZDeapOlJo4eWN-J^30Jv5mGGn?Az_+F8Q} z%uhDq?f5d!%=S!LmJlhY&Rvek~%O^cM$E>RmtV)J^4nxUd`a)?n)0dF2gpViK;75Sk?Ww*tM=p z&v^%cUDO4crxC0Tp+)96H?lBk-c;Y@3_NDi|m^%3>dXe=K;_rOnlF z4jN#Z@FV#jFa(hTpzt*<_D z$Q#kgl!!{Fc566`&~u%plCn%Ie}3L*TQ?p)0=~9@5`ZQ_tv2 zK2-aCk#Z@sj9O7a%{}pXuaH`}fo8!B?UGJ=U+>>OeRp_U{?64qp@g9Sw(0v30vvXR z$q8$>lXu9x!i&nmDR&^e3vlEbwn86u_{ zmL&IRkf^a-)1pQw=XejLr4yr>(-rX15WTx=AP28*SpWVj&%pDl>yuoKtzo? z>6ec$t zbq~~*Z6jD}v0{ctI>3-8N3@g?`HVewRkLG3k4cJ-bcDZqbx(H?`-Nc#!GA^=p~;96 zEO1%m7$U4-7F6`R1{X8=*Uip-A} zL4htn&OdF0chuyhHYDR&RVT#yLSCA{%>Cj6n%>{_pUCH_$HhG zOie;xDF=P!%bWdo*cM5wbE7mUW*xQ*LbKcXUWI>oA-J>B%vK7fJAVY$l~PTKZ5igj zvM!XE#AGD%e;d#uko$OhH#gn#mX@x*6{r|mS})i#Eafz0P!fhnIc`#sA_)Q@9SGSD z8JK8Hof}QgC;+OLwkB%!@~-!oN@5M>ufC#NTq!&OU2=3!#Z3a`T%Y6i#|gpBB0x`p zI^rV;@4!GG_fd|dFn1jvYlp4*`+<#3&fO)E*0kbo;L|>jS{%k!V3xT904b3OuuYpE zd8}Rya^bL7$4cqnP_h%;-2hbuscw$yOVXJpA8Z@dJ;YK>v0L^7!RcHoOQ+ZlU4W5j zm-*~?xe~dkvT@D9XOvEG%9qvlhV1pWY~9@jmaTd3hM~v(4>ILK82!^JeKvwpL0nq6 z2r2*jotTH~nB|jbw}FJK>lcRcvtUjwM;W1;&QGYZ%QQX2CfvF47wzIP*>i1I*8YMw z=B~Ysj7%(iyV1KeTZt$OOJb*aYUiQa9g=XdtmWa~Q&6$0L_ycRooJYpH-1!crWT8w5NLsdp z#3TVzHyLqhkhtc2*Aqy06l%){H!`jbQ9?eYT&TPDbWcR@p6%Oga>XwLy$ER+?~{Xw zgkPDc(_g#4cxP}S6x$zrm-6w>8sp-1gc`CKbneoB8w3-bJ0wJJ(}wtS28URKH;ll& z2a`JrJXTe4Y{Ti3xu4Y}nelZZ{;_n}xe1%Q4qNg@gW%6=I zhdyk!!>lXP7~urxFthTk?1OG+;m~z0k<{Yzg-ch%lQkGW!>H_-a474VAeX4`;zuNo z>pcr&$Rf(_8N#@PnUUz=Ew+BC8NRLBISrt3;$KautW`~4HW4i9*D8*Zr|r8uQ#_s5tIv4ZPm zc0NoPrt&0BHG{Hsh#h_eE1a_m6g|{_k8!0|6fr3OTyBQ-ezW~VyDG&=()ITaSS;uV zrUdm|qT0*YwlSNmx+6=mbA$wdSuS>dxw_fZQ!~1KsZV!=AJfe%$Z@AqR)*V{e$Q8Z 
zFuGNF2#rkSKG4CWn48TV?ADW)Hm3CmdG1c@Z|^4+pQHSibI>CV&gO)s#Mv-Ne8$Jk z9h+2_x87YL@^cx^Vb2du)sT1M+qd*n`4;7jDu-#z`bH{87 z>~svXKQk9wr1~8;WS=jhBFx8ZN9uFyQq7FXtk(Vr!4Ors31)&rq*w{bVfBk!Non3U zL3`=WYWdEzri-nr0oknQj{gOmw<(j$q`MEPy`Zk5+TQnm-VA&6^Bpsia`1gVdx{H> zVB@iVcaro+eVSmoy{srPjn+5ISJ>4ErMAx_a8a;(0{WLnzZ<^&#J6g>=*ZO?1uU>kgX?)2_sC*x!-Uswhv z@;m{ZZ`xz%^=ec6aZz_CAUPG#kQvq!=>Nm6R;lN;yAE@M*4N;RF999Jn)yYMo;MHo z{HS{K;CTD|@- zESx94NOP|+NMbx^;dPrmc~%d2DI3n&Iqf1SYHkZ&#ynk_cLz4X!A&+KyjCk}voaiN zM9o#rejt+Z>B3isAWHoUzJn?9b%-S8)fm%j=ZzViM8ywj?k7mx{0xtyZVB}EJLUBy z@m%6mh>5YotlP(mK{;g!*XoBkQ-%y{@%8<01>$PCCkc3UMoR_zl~pyhw?oM zGS}_(h%v2TH**?7impoGU!vI%|4NiCgllkWn3Yj&xDoxC9N5zvkC&fRzgVFx6=L2c z$85K)VmEshD&c^md}PnU^tP0JhD=NVaBX5;DNHEOH<=wect##)!q>g>6CEKO?$p7e zRH@2v{1r~LUUR;udnH(vljJN>=9p4dNQ08Q==X4!mfn2k=NI`y028VI>69e~aRfAi zIiy2*zt}AyFteC<(;G9Wxw(B0MmXX^`%*{<@-@FewBo7>{Yozu2lV81O5kNEsri^4}l= zz?9Db=k?Ew86XNofT)sh)n_9~eTZ6@k#AWm5`mDHw`6Gul%Y|DD~C zMMwrw=Yd}%>4fU*5UrVO{=7c^=;>!}j~#~aSux2&qMR~p6&-CS9SBpRP90Vlm1aH) zHvfE=k}mi71;3g7?{OFGQoqKI&L`JE;J0C$8Fvh6?NiorA+PFc!J!N(g2GWUsYVug zFQUV$w;PxdZx_f{Aoc%^Zc^z1N(o2_qWn0Jfne_${Z}US^2q-gU#Or(ihOZjXz8kl z%kWldvF?@>zsO@s#FjOq#^D;DIGTn}X-d!}f?ANeqa>&iuKpbgB^kYIDt@!G0MQY8 zMqnLl>E4pSa<5W9{OuNKQ6a5x!`_p-R`Y6%Wmux_OC~YkI zm;5z?31<}7iagFL;h5)-TTCKRK_4L$B8f*&K(pmA3lkrve>ULnt{_dl^=r0f-*d#5l1FAHkyUv+~4 z`(#QHfgC^`-A@nz4trzmRW?4lN_V&f`=2)%NqZvP1EQEIts3VP zRp@tnq&;-MX_o^qN){~cKQnlT>Wgpr4q^JODIsM z66eDzq9Vk7Zr@a;F{CbCKN!ZZYZjL1s2Z8#KB1o4(9$X8?}h%%^UpF;b(E_xEQbHq z3pHVR8jx2!bYho#XZ2?`g%OH zE3O-$O;=?JG~G;IWof&q+ zHx}KEOBbyBoWbyM-5xU=l|N@rZor^JrRgBap|wRT!T?V&c|?iTsfW+|^5zQc?uE&M zUpON*^`{|gcTF9gvU?~IUA0!XoO)Mtzu*++Yd+rMZL6-mJKriD=+H+Or2Pe6OF}Z* zcxfRZEG949{3b`z{5{G^(J+5~u&#I2aADDy*b-@DViBd%_S4T3wcZ}l%SxgY1j@L# za^LV;$7@#Qc^~{z*ZMB|Z%i$?*(g#3;1=4cXSQqBkOPJB-rpH<%u)VY#UR#o4|1${ zk3gtt8+w0ubkAy6wb|+=6|2ZV(;c3F@Z^=K+DjUWxI&gTk>b&&5nuGe7bEWDGU@1l zDtS%UgRZt8y?B4eqth)`neHI^NKf<>aX>r3rlO{6So^We{4d^jbA^-UR(i%3M!CyU 
zHtWYgyj$U(+rvGhSoh&sqhb4d!(6ICEH&onXlsref*T-lOlu(*!En)Yj_i&J%_CEw z6>^fcoQQYH-kE2~o3U@4LvZ#h!A0gPY&I_DFCSOsex*bd8$p}t?zK~p^6B@)e)2S= z{4TFyZ4ES90p`dh3f_pIbAL%tV6|PzA(=KYZ~j4!<_ z9oS1G&JshwT4Ns#E%&IQnD}A-gP+@fC)|?J(koifx5l7{Gj0A?`qM8non0C~i#vvz zC2VQAjyh@yHr;E`iQFt~n)N@5bD2etS5mFl-u5iUr!lrdi;}iIaZNIds1Rep)6$Lh z`V=h6C>%HG=mk?}W0?t3wK4jt4i0Y0%YNQfop`wsxu*X4a#*_Q&PJ<`k=nIL6UoHJ z=C5PT=YN>PTO{TTy`Ylv(A^j44x!!cI`ruZ$B)*Gtqsa!^%AibvL2g9+B);6+qs@& zZ;7AxH1OD{3ybRi{N_uj3b!Sm%bHbpY2pH+g4S*6xs~F_Xmb9)il%os`2Yv`*y5-9AwiGfLJGdjRao3F)?3At7; z>e{YU;YmchqCC#n67F%uL7t4pSC-*&gH?{v-R4mWT8$!!L9}%xP8`LE0l&Xh8DLY9 zG=&IlGM_P`75rMRm8zuMiCJ1Qn5nRdFknr$_Mh!9VwDo}R->YBIAP|YPg&OLlldXZ zr7Ud&8(Spx_8;F2+O;K?X6$Dw zLcH(8aFH3puZ+Vga)j9vL^#*R8_aAfny=D-&Y=0@WO%rkBfpq$-_hRBoGSjJ+{bE$ z#Fb=J`u3fy{*yQ|pCdIx7#XVnPP{|S9b8HM=jZ9l$hFtnWiLyOeBO43&%oj{ctXt9 zC$DruDl#y$&yh9%0ujL#zH>yNmugLE%Cbn#z7#pKXXBM0QQlGj$!n;j&w(y!j_WpQ6ic7vD!g72tWk;^2Avw z`9t5*o0XHXHrT16dMLSm3uD_lmB9G4=;YZ)mtv35u9hU5#laolk6JbHEj}GM=W5W$(}OgD+vyX_jg9@%5&XaEnq(lfmDat5jbEX^Ym)Xanvs9vLtXnWBZ~ z;lCOb9^;@?V(lx$%p~5c&wL4r*ljLdHB;QXNV~wrj!-vGhq1$N1Q7aTwms>Zy-*h# zN5YJb5)a`wxn>vhI;~O!-{CH@hX(w5|C6r$5XNWNYv~7DVP3NX6N%TFK4occYE1DV z7(TfMgCW1}i3y7tU()C}?!L)89;!7_q#j{_yR?l|zFMvGg0Hs9_u1si=HUD@kOkP^ zQu+ck?D9A14+c+C%Oqgv`0~aNwWPw3M*u|5|L&+RXL|BiXEZ>Lv6(Y?DkDf-_oFX# zV&gb@<6GK@GGi~6D=~WIuh)N!JrI1n5&O9kdqFywkfnYJ3}L>)H-E-uNe3_bHZ`4YA8Ce4Qk2~$(i;+uQRO{DW#-|d7R5lDzN!VsWJi7ZV8T`|(?4Kl)& zlAI)FgE=)cF}x0>eB4hTMl*SJqSX<@n2TF7X+BV_JdA{h<`<8^jBem@n}4=m9uYnO zDL0!(pMt#iYix{W1>+(0T%=DmXI@NBch-F*&qd}BkgX(bj^`ustqPj$@h45eNAptwg_aEJD{+3E|19CL4@ zpm3R!xH?yb9Z^Ika)Fj=G7gbf@h*m(={I=sF7m{a?>_HoR(+c(sHi#I2w9phi|H&l zF8APaoWgD^=BUueTq%bW{^aiyCd4SXn&~<^IMmD5U4$4Zk>t6{k#r4|a z%|^HLzq-NO#I*V|B8eGCX_A}Rfhw(=0o$JW%pA;(N2mJUlk}Y{y5=Q@HF1o#3HHtq zEGwstCm?QhlC-tpe;|_j*o8LvoP9Ob*R-es&FPZqwKKDNVbO-dSbU%rd`(hpi93o)$eNF`~b>5!D8&N+qVt{7>gdFyX=$~gFqva}jwHboNwbgtbF`#C%!cOUrM@&B zhDlvOh?`VWWwm+z9VInQU*kT(i^}3gCi%WW$&AVcK$A2sQZ_aB07Y>lo=*d{*X}}0 
zeir?%3l1{74(I8)OlnEy^<8dYWKnD_lFNPuE4sN}V`t)eF3lc2@SyYu5BQnlunp9g z^w}oF|M4r!*_|RV+L!CD!SW zl#Tc*tyzeDddX#~$nK8k)$-ep_2geQI+;x2A|d`C%SWR4c8aJ(bE5jeOZWFY0te zM}mIknaVkCszsf&Tx-CKM%gBD2UT0x2y8@|rf)F=FBvSpeTv1ivl_9b*ja}LBgx$s zB<|jr=BRO_b@on0dtu8nej)8P+4ypvhBH4z6jcy|{`}b>1Y*|TEBns6;f%O1Pko!B z>-n^-utgk&B_9&8^Q-=FN79LYjSgsR4|GANIxY%NbQXUD64Iyy3|S6KfI(1Rs7@>h=2Z-XcH!pe@{L3+i*xA zt|)UHRZfaJTulV1Oe)gFy6>t-*=xyrOVSifj$?{+^G!5PJFhWhdppW`18Po6#f#fm zNerrxl{-47lgywLb>dV^gnhE=rC)B07%DUPoQ`A$BSH-ssBBHKtRa?j(vc#0k6Z)- zkeyKSkc{MK703v~matL=QImIl9|^*L^^+jeibU3@wR<99*7cM84Oe|{q0U+ax=e}f zM|i9$a)Jv2gaO=nl6_GdvQ4d{_g0nr$0#nKcTRaA`jgwuY{5pM4)MAB-_4zTCbzEF z>1ESAds0<`yNuzN$d~6!on9k}V^(j#|0Q|bVJH??fS5(Nl3#?(mHa*X7>xSzL&6LYzW+XnGEywL6o!yJ< zNj%p$O{AsqpcxZ<|*RKi7JTt9=GNizwu_r-S*;O)BYNwX+ZP_^7$@#?PSzjjE3(zCk8TE_JKKLfO z2Po+%-SPg@8jXYx@T?>$xc5G>p<5*bLkoAkjdw-b-e0N$O(I}yUZw+rD*AX;7mEdW|sARFVaA`dMTb=}^!wN+#qUO6fPFU8)znEx9 zD#Hwkf>N+YjcR=F#lXb)+S|wvu&l4G9&Qz8D*k@HDH729 zaGh6Hm7Mv*s9Sm0)-8t2>nn{eI&85UYu|EVF7}H;+@5?a*}I$k*db5T$FPJ4knQ2h zWrUDJd5b&loc#~VueT6O3orRF0q{*oN~*fO?udtB3M#1Z*b~_%RTe1; zw1WE1;)#5l3%o53N%1kA^1>AA0Ll<@5l(5$eEZ@H#BdNN_>}U#({B4{?}D+(TNnAM z)?!kl7pI%{PWSn7^=^rD+X4SVR^yWy9yd^k=^(GOO_#3FDUxV&p@~Mf{;Gn({m!iE zIcMA!Ku>huy2T*H@0`vyWG6?0?V7Kg9or9#DnA@7-yeWc7u_Oa#>fQ9jb#uCV~U(( zQNPsk_fmoD)BVGfN+7YO<5M_zxA2r*YT1?_jQY|A(b3JJ3IxeqQ=rW@lq+j?O!0nJ`n?E4 zOMx3eLFrG6$E+eXKNIbB6IQBMzn8$7F_iId%7FwN&-Q|;aj5*n zbe2i?{cn79fsJ}~m*tj+yCXfCj(`Bi2eiBIpm3?E^N;y+wAqEpfiHsPFDu@@joAmNhRD_Q87!NZH;a~{HEjnOggd>$J||4wxy4C zFPi}+(&H6VNb6$Xfv1Yn0>Bsy&&aqPxYa%1n}hFaD44g=B&%u|)J(-C8fx}3RviMk zs^b+*zf`v}_PRXZY2FqSGA+nf=i>c`vf`mTIzU~xf&lZ4ZRIvv9kRuzvESMeNjnRQ zuNItQqWy#B`^!(1J;xG~E50e{x{zu&-Mf%8q)yP;o#F866RdnD{Ic+%+tHrzMwtdK za=2dN1>)+Z?Qf^7f&M5;%>k3AY)MTCC|*I`jeqt4@tEUrK}H$m5c*uSdA0oY7Nk0HO?dw@HkG$V-#-#TTG+8Rni>c>k&J za^k^midC7-y&ZJ{e={5}s~KXuJAc25q6xIvd9yXg5vCK}q3q@EwSnmkwh@`pZ?WR7 z03kX`>|Em5kXOeC`SMzDE}Rvzi9QyUOPF6RYUDGr(*b*Bci+rBcNad;MaZ=lx!aqJ 
zo5I4T#ArPFRFJp@NBirm_Xd;sE`7T7E6<9aCtjtx%?-k}j~4LX>h{?J__?lO1L8I^ zlt2!&1NPei9QTkJby`#7QA;C@y;+tAy=%TTmNSd-M;eH(wz>xFh7?o$g^*t~Ab#&s zcG9CCsRhN=?mmQ%+=FihxS?zCFgKYug$Wo!R}bEU;3GVX;TnR+fegY1ASq{!u?d(n z#K>KRXAQShpg37~Nk4hLT~1e~Hb?k9H#%VkB!LY})vp+`Hjg1NN3(q;k!^{snU;5^ zDRJ;W>w)#Dp}!hceeCGBkSRa*JEDM@o5M_2S(kk|8F_j6ktB-uB+Qds;zu*wMUkwqv`HBIMA6?qr4&Z2nC^Z7%xi>)CFrPp= zvd;^#{QdJd;*;9(5FKw8$q(3Bt?oM5u_%_VrdfjIF`V*n^XO51$g9oQkJu^pAFG$X zI|0&OyvVNP3KrkbBw{z+HSJ@(O7zPeC7&G%SLA<-yfJ!8$`yqKazjA~*Md})Z-8hB z>RWs7%cNQ}`Dl2kS0cwYF1-2_bgL-uT>005|Kb~L)^-WbjYN2T9VQdB7+J2qS_h>e zQ)z$+n-1zr$9*eHJ}%&S4X&|_{y-<<o$IS~V{)jZx??mc_DyQ~h0q?aFA5rA;hND7Z8xz%f`!EG{I~+{wFdfUVQ0_is{Y29IB+LrjQ!k>VNZ5ROg>=JRgZP(vO1sr$o&KcVU=x zua+Sj9DF{9CCMCtFEzH?^}!I*&_O`P%u?mX<@V zRR+$8fYT1i`z(Fg8P)7aw|=5(hY+^@O=|@D_6R&{7mF{ShWPa->GD zHI0g1AK$!4-xLrew`oJCH0U!-XA26YK&^WKV=RK&4C9#Ti~l0kQmsIDVfbQeaOhtY z+Q3tA4sMCU6t=|B3DaPhGtfxC*?vcG@F38BTiVzD zC)aGtpauuMl9;tC)4yv|jt$mkI-Bo*E+hY8;_qj0P})OEc0hF0LG|tC%Bu9d*e(Bh z4-ibg7wUfB_U3VJdTC-HsNn-$V01f4hRqLe{MCO8kW0}G}5Wov)zwS!1f0qFC z|NeL1z3+3+3Oyf=L&R<|AArK}T=^l;Xa*Sa<=>72k}#m=Y&})>_<1p{_ca{g!9u(r zfBbo6T^qFzKcN18t_W}F3tQ9*TEmWTDp{k4)8Q6?LkcK11Jsc;my}wY;P^{ieH%` z;sor-P##PdK9HKAn!yA>lfg)3WqUx=m823)NyR${;X!9f{5o*6^dGduBi$39%q{^v zy9zTm5vE$Zs$X~9!_*Tj2Z!F^^6j8f_@67khbzc!rn|k#eh^?qIb8)$oFGWKu3AHfn*BM#tNSNZzzK=-#V?R*FRQ$i z!aG5PMLFRTJL8(xc0!lHYNmptX5apzzRmhffqU6B-A359W>i`aw2_)@5`vMpIjN5? 
zTk{ZJ%XgpBbW*rXs2V$P{FL|vc}vRg$5ASPbe2^YoClCp>C%0o>EZt9p#_i$P6(c) z+{y$Ue+0VcNUour{Db58jzQjB2%ekqGaSFk}C7==~}yr2MtRe6s91M^5x zPDFML^_`!0s=OcW>2|faE@$Z0RiBqs0Oe5E*268 zM!-`R$s)hsMSdjT7&++lN&kS-hqP#%jdS8|T@wkliq>$htByjRsro5s11>W@fDOJN z&sK_cat$KlCA9VpPQbg6wwcr1f~$PhF5iR%H-4?_m# zH**p~Uq;3A!vl~q71(crMc?we$YWC;TM)}N(38f9T)yj^j*{Mi7%oDyz=MW8AT`n# zK#iu5yRl8b6wH8>s>(Jib%$z+OxbZRgV$y^iqt`Ks^YzRDmA~tBW&LiTy2R4|GckW{qQTB}!U8RxX z?`Id;?#9^4peW22s8_uKsgZ)@6Zp+0r{(Ua!aRm0Y8(|`C%7JQH|?gA0Y^!^-+Sw} z0UB{9rMv$448!E$ONb?xq^=ug8T@80=6*XST#A!i3neb)I3%1aww91d{MAwOqoGs< z!^V9aOjv@SGEZ%+W?Q~C8~bIJX@d7WB2q{yjb^|w;YnPrz@WK*#!FFCo1)gwbzl?7 z$T8x=lr-D2923=sPbJ3x zmVn<2<~XOR>Cg@P7g555eWY=sz#bHi)zrj#h^ntHa_ANy2VM9RA7MK==abh$ODHDA zl@k@z04A%RhZcQti}dwO#F9K`tY&sXm~sp?Op}aVR7QvvxtvK*7WFvcr$l2!_^=f$ z@Fve#Xk*BkYDS{c*nm$I*5ch6*|WWNM(#O*lo)3s@iyxlMW@;J{R!`Xa{1w_??X#3 z@nAc&ZGx8G^6@eIZm3;t+kel0DgASB z>^>1G)G+)f000NEEeI=ONLQF4Tfx791d#}cD`a9Ul2iYQM8K>FDF`gqr1}5x+<`(3 z=@<}Mj2eBU`>zk{8WdX8VGo`Dw}>zb6%o!;O~d|;M#9?apy1;F!{zKectfA#Q(U4DgtQ&A(m5^hcC}vdWf1hWUOGp&)bYWq zT+GFl_20Ni3A*$m3f{(~f5WqBD1O-+PS1qu9^mn&szKq|PRO9`f5I~!C_MXpX1evC z@C=%yb^2{kt@WSqEDAit`unEY`hS;4;01Vw8NM_7|JXV}aUpnw
+
+### Event
+
+In the above workflow, a pair of events are needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting or an ending event. There are three kinds of events:
+
+```c++
+enum EventKind { kMark,
+                 kPushRange,
+                 kPopRange};
+```
+- kMark: only a mark.
+- kPushRange: mark the starting event for time range.
+- kPopRange: mark the ending event for the time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, event lists are used to record each piece.
+```c++
+class Event {
+ public:
+  // The DeviceContext is used to get current CUDA stream.
+  Event(EventKind kind, std::string name, uint32_t thread_id,
+        const platform::DeviceContext* dev_ctx = nullptr);
+  double CpuElapsedUs(const Event& e) const;
+  double CudaElapsedUs(const Event& e) const;
+
+ private:
+  EventKind kind_;
+  std::string name_;
+  uint32_t thread_id_;
+  int64_t cpu_ns_;
+#ifdef PADDLE_WITH_CUDA
+  cudaEvent_t event_ = nullptr;
+  int device_ = -1;
+#endif
+};
+
+struct EventList {
+  std::forward_list<std::vector<Event>> event_blocks;
+};
+```
+
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
+
+```c++
+enum ProfilerState {
+  kDisabled,
+  kCPU,
+  kCUDA
+};
+ProfilerState kState;
+```
+- kDisabled: the disabled state.
+- kCPU: profiling for CPU code.
+- kCUDA: profiling for GPU code.
+
+A pair of starting and ending events are pushed to event lists in the constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
+ +```c++ +struct RecordEvent { + explicit RecordEvent(const std::string name, + platform::DeviceContext* dev_ctx = nullptr) { + if (kState == ProfilerState::kDisabled) return; + // push the starting event to the event lists. + } + ~RecordEvent() { + if (kState == ProfilerState::kDisabled) return; + // push the ending event to the event lists. + } +}; +``` From 9d73950ec9ab7fb14c2ca2f8128f0b0944b5ed7e Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 14 Dec 2017 11:02:57 +0800 Subject: [PATCH 02/20] Add profiling tools for fluid. --- paddle/platform/CMakeLists.txt | 3 + paddle/platform/device_context.h | 12 ++ paddle/platform/profiler.cc | 74 ++++++++++++ paddle/platform/profiler.h | 197 +++++++++++++++++++++++++++++++ paddle/platform/profiler_test.cc | 98 +++++++++++++++ 5 files changed, 384 insertions(+) create mode 100644 paddle/platform/profiler.cc create mode 100644 paddle/platform/profiler.h create mode 100644 paddle/platform/profiler_test.cc diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index 88df28a966..9fb6cd0de5 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -30,3 +30,6 @@ nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context) + +cc_library(profiler SRCS profiler.cc DEPS device_context) +cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index ef5f19214d..2b10cc5df8 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -103,6 +103,18 @@ class CUDADeviceContext : public DeviceContext { cublasHandle_t cublas_handle_; }; +class DeviceGuard { + public: + explicit DeviceGuard(int device) { + original_device_ = 
platform::GetCurrentDeviceId(); + platform::SetDeviceId(device); + } + ~DeviceGuard() { platform::SetDeviceId(original_device_); } + + private: + int original_device_; +}; + #endif } // namespace platform diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc new file mode 100644 index 0000000000..40b34b732c --- /dev/null +++ b/paddle/platform/profiler.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/platform/profiler.h" + +namespace paddle { +namespace platform { + +ProfilerState kState = ProfilerState::kDisabled; +uint32_t kNextThreadId = 0; +std::mutex kAllEventListsMutex; +std::list> kAllEventLists; +thread_local std::shared_ptr kEventList; +thread_local int32_t kThreadId; + +void EnableProfiler(ProfilerState state) { + PADDLE_ENFORCE(state != ProfilerState::kDisabled, + "Can't enbale profling, since the input state is ", + "ProfilerState::kDisabled"); + PADDLE_ENFORCE(kState == ProfilerState::kDisabled, + "The profiling state should be disabled when calling ", + "EnableProfiler."); + kState = state; +#ifdef PADDLE_WITH_CUDA + auto ForEachDevice = [](std::function op) { + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + DeviceGuard dev_guard(i); + op(i); + } + }; + if (kState == ProfilerState::kCUDA) { + // Generate some dummy evenets first to reduce the startup overhead. 
+ for (int i = 0; i < 5; i++) { + ForEachDevice([](int d) { + DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d)); + Mark("_cuda_startup_", dev_ctx); + dev_ctx->Wait(); + }); + } + } +#endif + // Mark the profiling start. + Mark("_start_profiler_"); +} + +std::vector> DisableProfiler() { + PADDLE_ENFORCE(kState != ProfilerState::kDisabled, + "Can't disable profiling, since it's not starting."); + // Mark the profiling stop. + Mark("_stop_profiler_"); + kState = ProfilerState::kDisabled; + std::vector> result; + std::lock_guard guard(kAllEventListsMutex); + for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) { + auto& list = *it; + result.emplace_back(list->Reduce()); + } + return result; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h new file mode 100644 index 0000000000..2242635024 --- /dev/null +++ b/paddle/platform/profiler.h @@ -0,0 +1,197 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace platform { + +enum EventKind { kMark, kPushRange, kPopRange }; + +inline uint64_t GetTimeInNsec() { + // using std::chrono; + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +class Event { + public: + // the DeviceContext is used to get the cuda stream. + Event(EventKind kind, std::string name, uint32_t thread_id, + const platform::DeviceContext* dev_ctx = nullptr) + : kind_(kind), name_(std::move(name)), thread_id_(thread_id) { + has_cuda_ = false; +#ifdef PADDLE_WITH_CUDA + auto* cuda_dev_ctx = + static_cast(dev_ctx); + if (cuda_dev_ctx) { + PADDLE_ENFORCE(cudaGetDevice(&device_)); + PADDLE_ENFORCE(cudaEventCreate(&event_)); + auto stream = cuda_dev_ctx->stream(); + PADDLE_ENFORCE(cudaEventRecord(event_, stream)); + has_cuda_ = true; + } +#endif + cpu_ns_ = GetTimeInNsec(); + } + + std::string kind() const { + switch (kind_) { + case EventKind::kMark: + return "mark"; + case EventKind::kPushRange: + return "push"; + case EventKind::kPopRange: + return "pop"; + } + PADDLE_THROW("Unknown EventKind."); + } + + std::string name() const { return name_; } + + bool has_cuda() const { return has_cuda_; } + +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event() const { return event_; } + + int device() const { return device_; } +#endif + + double CpuElapsedUs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000.0); + } + + double CudaElapsedUs(const Event& e) const { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(e.has_cuda() && has_cuda()); + PADDLE_ENFORCE(e.device() == device()); + PADDLE_ENFORCE(cudaEventSynchronize(event_)); + PADDLE_ENFORCE(cudaEventSynchronize(e.event())); + float ms; + PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); + return ms * 1000.0; +#else + PADDLE_THROW("CUDA is not enabled"); +#endif + } + + private: + EventKind 
kind_; + std::string name_; + uint32_t thread_id_; + int64_t cpu_ns_; + bool has_cuda_; +#ifdef PADDLE_WITH_CUDA + cudaEvent_t event_ = nullptr; + int device_ = -1; +#endif +}; + +struct EventList { + constexpr static std::size_t kMB = 1024 * 1024; + constexpr static std::size_t kEventBlockSize = 16 * kMB; + constexpr static std::size_t kEventSize = sizeof(Event); + constexpr static std::size_t kEventAlign = alignof(Event); + constexpr static std::size_t kNumBlock = + kEventBlockSize / + ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); + + template + void Record(Args&&... args) { + if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { + event_blocks.emplace_front(); + event_blocks.front().reserve(kNumBlock); + } + event_blocks.front().emplace_back(std::forward(args)...); + } + + std::vector Reduce() { + std::vector result; + for (auto& block : event_blocks) { + result.insert(result.begin(), std::make_move_iterator(block.begin()), + std::make_move_iterator(block.end())); + } + event_blocks.clear(); + return result; + } + + std::forward_list> event_blocks; +}; + +enum ProfilerState { + kDisabled, + kCPU, + kCUDA, +}; + +// The profiler state, the initial value is ProfilerState::kDisabled +extern ProfilerState kState; +// The global mutex +extern std::mutex kAllEventListsMutex; +// The total event lists of all threads +extern std::list> kAllEventLists; +// The thread local event list only can be accessed by the specific thread +extern thread_local std::shared_ptr kEventList; +// The thread index of each thread +extern thread_local int32_t kThreadId; +// The kNextThreadId is a global counter for threads, by the kThreadId and +// kNextThreadId, we can know how many threads have created EventList. 
+extern uint32_t kNextThreadId; + +inline EventList& GetEventList() { + if (!kEventList) { + std::lock_guard guard(kAllEventListsMutex); + kEventList = std::make_shared(); + kThreadId = kNextThreadId++; + kAllEventLists.emplace_front(kEventList); + } + return *kEventList; +} + +inline void Mark(const std::string name, + const platform::DeviceContext* dev_ctx = nullptr) { + GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx); +} + +struct RecordEvent { + explicit RecordEvent(const std::string name, + platform::DeviceContext* dev_ctx = nullptr) { + if (kState == ProfilerState::kDisabled) return; + dev_ctx_ = dev_ctx; + GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId, + dev_ctx_); + } + + ~RecordEvent() { + if (kState == ProfilerState::kDisabled) return; + GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId, + dev_ctx_); + } + platform::DeviceContext* dev_ctx_; +}; + +void EnableProfiler(ProfilerState state); +std::vector> DisableProfiler(); + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc new file mode 100644 index 0000000000..ed64ff40c9 --- /dev/null +++ b/paddle/platform/profiler_test.cc @@ -0,0 +1,98 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/platform/profiler.h" +#include "gtest/gtest.h" + +TEST(Event, CpuElapsedTime) { + using paddle::platform::Event; + using paddle::platform::EventKind; + + Event start_event(EventKind::kPushRange, "test", 0); + EXPECT_TRUE(start_event.has_cuda() == false); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0); + EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); +} + +#ifdef PADDLE_WITH_CUDA +TEST(Event, CudaElapsedTime) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + using paddle::platform::Event; + using paddle::platform::EventKind; + + DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0)); + Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); + EXPECT_TRUE(start_event.has_cuda() == true); + int counter = 0; + while (counter != 1000) { + counter++; + } + Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); + EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0); +} +#endif + +TEST(RecordEvent, RecordEvent) { + using paddle::platform::DeviceContext; + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; + using paddle::platform::Event; + using paddle::platform::EventKind; + using paddle::platform::RecordEvent; + using paddle::platform::ProfilerState; + + ProfilerState state = ProfilerState::kCPU; + DeviceContext* dev_ctx = nullptr; +#ifdef PADDLE_WITH_CUDA + state = ProfilerState::kCUDA; + dev_ctx = + new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0)); +#endif + EnableProfiler(state); + + for (int i = 1; i < 5; ++i) { + std::string name = "op_" + std::to_string(i); + RecordEvent record_event(name, dev_ctx); + int counter = 1; + while (counter != i * 1000) counter++; + } + std::vector> events = paddle::platform::DisableProfiler(); + int cuda_startup_count = 0; + int start_profiler_count = 0; + int stop_profiler_count = 0; + for (size_t i = 
0; i < events.size(); ++i) { + for (size_t j = 0; j < events[i].size(); ++j) { + if (events[i][j].name() == "_cuda_startup_") ++cuda_startup_count; + if (events[i][j].name() == "_start_profiler_") ++start_profiler_count; + if (events[i][j].name() == "_stop_profiler_") ++stop_profiler_count; + if (events[i][j].name() == "push") { + EXPECT_EQ(events[i][j + 1].name(), "pop"); +#ifdef PADDLE_WITH_CUDA + EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0); +#else + EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0); +#endif + } + } + } + EXPECT_EQ(cuda_startup_count % 5, 0); + EXPECT_EQ(start_profiler_count, 1); + EXPECT_EQ(stop_profiler_count, 1); +} From f266284d9f0fe173cd4d2efc97a3461781050372 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 18 Dec 2017 16:53:52 +0800 Subject: [PATCH 03/20] Fix the compiling for only CPU mode. --- paddle/platform/profiler_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index ed64ff40c9..5bd0a9d859 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -51,8 +51,6 @@ TEST(Event, CudaElapsedTime) { TEST(RecordEvent, RecordEvent) { using paddle::platform::DeviceContext; - using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; using paddle::platform::Event; using paddle::platform::EventKind; using paddle::platform::RecordEvent; @@ -61,6 +59,8 @@ TEST(RecordEvent, RecordEvent) { ProfilerState state = ProfilerState::kCPU; DeviceContext* dev_ctx = nullptr; #ifdef PADDLE_WITH_CUDA + using paddle::platform::CUDADeviceContext; + using paddle::platform::GPUPlace; state = ProfilerState::kCUDA; dev_ctx = new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0)); From 0a5fbb06508731aa55ffda3e4a68a9fabff2a72a Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 29 Dec 2017 18:04:03 +0800 Subject: [PATCH 04/20] Refine code struct. 
--- paddle/platform/device_context.h | 12 --- paddle/platform/profiler.cc | 149 +++++++++++++++++++++++++------ paddle/platform/profiler.h | 131 +++++---------------------- paddle/platform/profiler_test.cc | 12 +-- 4 files changed, 154 insertions(+), 150 deletions(-) diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 07e197ba0b..2b366e6383 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -115,18 +115,6 @@ class CUDNNDeviceContext : public CUDADeviceContext { cudnnHandle_t cudnn_handle_; }; -class DeviceGuard { - public: - explicit DeviceGuard(int device) { - original_device_ = platform::GetCurrentDeviceId(); - platform::SetDeviceId(device); - } - ~DeviceGuard() { platform::SetDeviceId(original_device_); } - - private: - int original_device_; -}; - #endif /*! \brief device context pool singleton */ diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc index 40b34b732c..4e89e5c600 100644 --- a/paddle/platform/profiler.cc +++ b/paddle/platform/profiler.cc @@ -17,34 +17,133 @@ limitations under the License. */ namespace paddle { namespace platform { -ProfilerState kState = ProfilerState::kDisabled; -uint32_t kNextThreadId = 0; -std::mutex kAllEventListsMutex; -std::list> kAllEventLists; -thread_local std::shared_ptr kEventList; -thread_local int32_t kThreadId; +// The profiler state, the initial value is ProfilerState::kDisabled +static ProfilerState g_state = ProfilerState::kDisabled; +// The thread local event list only can be accessed by the specific thread +// The thread index of each thread +static thread_local int32_t g_thread_id; +// The g_next_thread_id is a global counter for threads, by the g_thread_id and +// g_next_thread_id, we can know how many threads have created EventList. 
+static uint32_t g_next_thread_id = 0; +// The global mutex +static std::mutex g_all_event_lists_mutex; +// The total event lists of all threads +static std::list> g_all_event_lists; +// The thread local event list only can be accessed by the specific thread +static thread_local std::shared_ptr g_event_list; + +inline uint64_t GetTimeInNsec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + +Event::Event(EventKind kind, std::string name, uint32_t thread_id, + DeviceContext* dev_ctx) + : kind_(kind), + name_(std::move(name)), + thread_id_(thread_id), + has_cuda_(false) { +#ifdef PADDLE_WITH_CUDA + auto* cuda_dev_ctx = static_cast(dev_ctx); + if (cuda_dev_ctx) { + PADDLE_ENFORCE(cudaGetDevice(&device_)); + PADDLE_ENFORCE(cudaEventCreate(&event_)); + auto stream = cuda_dev_ctx->stream(); + PADDLE_ENFORCE(cudaEventRecord(event_, stream)); + has_cuda_ = true; + } +#endif + cpu_ns_ = GetTimeInNsec(); +} + +std::string Event::kind() const { + switch (kind_) { + case EventKind::kMark: + return "mark"; + case EventKind::kPushRange: + return "push"; + case EventKind::kPopRange: + return "pop"; + } + PADDLE_THROW("Unknown EventKind."); +} + +double Event::CpuElapsedUs(const Event& e) const { + return (e.cpu_ns_ - cpu_ns_) / (1000.0); +} + +double Event::CudaElapsedUs(const Event& e) const { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(e.has_cuda() && has_cuda()); + PADDLE_ENFORCE(e.device() == device()); + PADDLE_ENFORCE(cudaEventSynchronize(event_)); + PADDLE_ENFORCE(cudaEventSynchronize(e.event())); + float ms; + PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); + return ms * 1000.0; +#else + PADDLE_THROW("CUDA is not enabled"); +#endif +} + +#ifdef PADDLE_WITH_CUDA +static void ForEachDevice(std::function func) { + auto original_device = GetCurrentDeviceId(); + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; i++) { + SetDeviceId(i); + func(i); + } + 
SetDeviceId(original_device); +} +#endif + +inline EventList& GetEventList() { + if (!g_event_list) { + std::lock_guard guard(g_all_event_lists_mutex); + g_event_list = std::make_shared(); + g_thread_id = g_next_thread_id++; + g_all_event_lists.emplace_front(g_event_list); + } + return *g_event_list; +} + +void Mark(const std::string& name, DeviceContext* dev_ctx) { + GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, + dev_ctx); +} + +RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { + if (g_state == ProfilerState::kDisabled) return; + dev_ctx_ = dev_ctx; + GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, + dev_ctx_); +} + +RecordEvent::~RecordEvent() { + if (g_state == ProfilerState::kDisabled) return; + GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id, + dev_ctx_); +} void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE(state != ProfilerState::kDisabled, "Can't enbale profling, since the input state is ", "ProfilerState::kDisabled"); - PADDLE_ENFORCE(kState == ProfilerState::kDisabled, + PADDLE_ENFORCE(g_state == ProfilerState::kDisabled, "The profiling state should be disabled when calling ", "EnableProfiler."); - kState = state; + g_state = state; #ifdef PADDLE_WITH_CUDA - auto ForEachDevice = [](std::function op) { - int count = GetCUDADeviceCount(); - for (int i = 0; i < count; i++) { - DeviceGuard dev_guard(i); - op(i); - } - }; - if (kState == ProfilerState::kCUDA) { + if (g_state == ProfilerState::kCUDA) { // Generate some dummy evenets first to reduce the startup overhead. for (int i = 0; i < 5; i++) { ForEachDevice([](int d) { - DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(d)); + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); Mark("_cuda_startup_", dev_ctx); dev_ctx->Wait(); }); @@ -52,20 +151,20 @@ void EnableProfiler(ProfilerState state) { } #endif // Mark the profiling start. 
- Mark("_start_profiler_"); + Mark("_start_profiler_", nullptr); } std::vector> DisableProfiler() { - PADDLE_ENFORCE(kState != ProfilerState::kDisabled, + PADDLE_ENFORCE(g_state != ProfilerState::kDisabled, "Can't disable profiling, since it's not starting."); // Mark the profiling stop. - Mark("_stop_profiler_"); - kState = ProfilerState::kDisabled; + Mark("_stop_profiler_", nullptr); + g_state = ProfilerState::kDisabled; std::vector> result; - std::lock_guard guard(kAllEventListsMutex); - for (auto it = kAllEventLists.begin(); it != kAllEventLists.end(); ++it) { - auto& list = *it; - result.emplace_back(list->Reduce()); + std::lock_guard guard(g_all_event_lists_mutex); + for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); + ++it) { + result.emplace_back((*it)->Reduce()); } return result; } diff --git a/paddle/platform/profiler.h b/paddle/platform/profiler.h index 2242635024..47104ea9d0 100644 --- a/paddle/platform/profiler.h +++ b/paddle/platform/profiler.h @@ -24,76 +24,24 @@ namespace platform { enum EventKind { kMark, kPushRange, kPopRange }; -inline uint64_t GetTimeInNsec() { - // using std::chrono; - using clock = std::conditional::type; - return std::chrono::duration_cast( - clock::now().time_since_epoch()) - .count(); -} - class Event { public: - // the DeviceContext is used to get the cuda stream. + // The DeviceContext is used to get the cuda stream. + // If CPU profiling mode, can pass nullptr. 
Event(EventKind kind, std::string name, uint32_t thread_id, - const platform::DeviceContext* dev_ctx = nullptr) - : kind_(kind), name_(std::move(name)), thread_id_(thread_id) { - has_cuda_ = false; -#ifdef PADDLE_WITH_CUDA - auto* cuda_dev_ctx = - static_cast(dev_ctx); - if (cuda_dev_ctx) { - PADDLE_ENFORCE(cudaGetDevice(&device_)); - PADDLE_ENFORCE(cudaEventCreate(&event_)); - auto stream = cuda_dev_ctx->stream(); - PADDLE_ENFORCE(cudaEventRecord(event_, stream)); - has_cuda_ = true; - } -#endif - cpu_ns_ = GetTimeInNsec(); - } - - std::string kind() const { - switch (kind_) { - case EventKind::kMark: - return "mark"; - case EventKind::kPushRange: - return "push"; - case EventKind::kPopRange: - return "pop"; - } - PADDLE_THROW("Unknown EventKind."); - } + DeviceContext* dev_ctx); + std::string kind() const; std::string name() const { return name_; } - bool has_cuda() const { return has_cuda_; } #ifdef PADDLE_WITH_CUDA cudaEvent_t event() const { return event_; } - int device() const { return device_; } #endif - double CpuElapsedUs(const Event& e) const { - return (e.cpu_ns_ - cpu_ns_) / (1000.0); - } - - double CudaElapsedUs(const Event& e) const { -#ifdef PADDLE_WITH_CUDA - PADDLE_ENFORCE(e.has_cuda() && has_cuda()); - PADDLE_ENFORCE(e.device() == device()); - PADDLE_ENFORCE(cudaEventSynchronize(event_)); - PADDLE_ENFORCE(cudaEventSynchronize(e.event())); - float ms; - PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); - return ms * 1000.0; -#else - PADDLE_THROW("CUDA is not enabled"); -#endif - } + double CpuElapsedUs(const Event& e) const; + double CudaElapsedUs(const Event& e) const; private: EventKind kind_; @@ -108,11 +56,11 @@ class Event { }; struct EventList { - constexpr static std::size_t kMB = 1024 * 1024; - constexpr static std::size_t kEventBlockSize = 16 * kMB; - constexpr static std::size_t kEventSize = sizeof(Event); - constexpr static std::size_t kEventAlign = alignof(Event); - constexpr static std::size_t kNumBlock = + constexpr 
static size_t kMB = 1024 * 1024; + constexpr static size_t kEventBlockSize = 16 * kMB; + constexpr static size_t kEventSize = sizeof(Event); + constexpr static size_t kEventAlign = alignof(Event); + constexpr static size_t kNumBlock = kEventBlockSize / ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); @@ -139,58 +87,27 @@ struct EventList { }; enum ProfilerState { - kDisabled, - kCPU, - kCUDA, + kDisabled, // disabled state + kCPU, // CPU profiling state + kCUDA, // GPU profiling state }; -// The profiler state, the initial value is ProfilerState::kDisabled -extern ProfilerState kState; -// The global mutex -extern std::mutex kAllEventListsMutex; -// The total event lists of all threads -extern std::list> kAllEventLists; -// The thread local event list only can be accessed by the specific thread -extern thread_local std::shared_ptr kEventList; -// The thread index of each thread -extern thread_local int32_t kThreadId; -// The kNextThreadId is a global counter for threads, by the kThreadId and -// kNextThreadId, we can know how many threads have created EventList. 
-extern uint32_t kNextThreadId; - -inline EventList& GetEventList() { - if (!kEventList) { - std::lock_guard guard(kAllEventListsMutex); - kEventList = std::make_shared(); - kThreadId = kNextThreadId++; - kAllEventLists.emplace_front(kEventList); - } - return *kEventList; -} - -inline void Mark(const std::string name, - const platform::DeviceContext* dev_ctx = nullptr) { - GetEventList().Record(EventKind::kMark, std::move(name), kThreadId, dev_ctx); -} +void Mark(const std::string& name, DeviceContext* dev_ctx); struct RecordEvent { - explicit RecordEvent(const std::string name, - platform::DeviceContext* dev_ctx = nullptr) { - if (kState == ProfilerState::kDisabled) return; - dev_ctx_ = dev_ctx; - GetEventList().Record(EventKind::kPushRange, std::move(name), kThreadId, - dev_ctx_); - } + explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); - ~RecordEvent() { - if (kState == ProfilerState::kDisabled) return; - GetEventList().Record(EventKind::kPopRange, std::string(), kThreadId, - dev_ctx_); - } - platform::DeviceContext* dev_ctx_; + ~RecordEvent(); + + // The device context is used by Event to get the current cuda stream. + DeviceContext* dev_ctx_; }; +// Enable the profiling function. void EnableProfiler(ProfilerState state); + +// Return the event list of all threads. Asummed the returned value calls +// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. 
std::vector> DisableProfiler(); } // namespace platform diff --git a/paddle/platform/profiler_test.cc b/paddle/platform/profiler_test.cc index 5bd0a9d859..47cf7be146 100644 --- a/paddle/platform/profiler_test.cc +++ b/paddle/platform/profiler_test.cc @@ -19,13 +19,13 @@ TEST(Event, CpuElapsedTime) { using paddle::platform::Event; using paddle::platform::EventKind; - Event start_event(EventKind::kPushRange, "test", 0); + Event start_event(EventKind::kPushRange, "test", 0, nullptr); EXPECT_TRUE(start_event.has_cuda() == false); int counter = 0; while (counter != 1000) { counter++; } - Event stop_event(EventKind::kPopRange, "test", 0); + Event stop_event(EventKind::kPopRange, "test", 0, nullptr); EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); } @@ -33,11 +33,11 @@ TEST(Event, CpuElapsedTime) { TEST(Event, CudaElapsedTime) { using paddle::platform::DeviceContext; using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; using paddle::platform::Event; using paddle::platform::EventKind; - DeviceContext* dev_ctx = new CUDADeviceContext(GPUPlace(0)); + DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0)); Event start_event(EventKind::kPushRange, "test", 0, dev_ctx); EXPECT_TRUE(start_event.has_cuda() == true); int counter = 0; @@ -60,10 +60,10 @@ TEST(RecordEvent, RecordEvent) { DeviceContext* dev_ctx = nullptr; #ifdef PADDLE_WITH_CUDA using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; state = ProfilerState::kCUDA; dev_ctx = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace(0)); + new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0)); #endif EnableProfiler(state); From 2a5c6a4435157868e337109a77cbed7320a3e7a3 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 29 Dec 2017 13:44:40 +0800 Subject: [PATCH 05/20] Fix the typo error and add more comments. 
--- doc/design/profiler.md | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/doc/design/profiler.md b/doc/design/profiler.md index 3b95bf0065..b20b5efdc1 100644 --- a/doc/design/profiler.md +++ b/doc/design/profiler.md @@ -1,44 +1,46 @@ ## Introduction -There are many performance analysis tools for [different programming languages and different software framework](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). For most popular deep learning framework, they used several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning framework, the PaddlePaddle also used C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU device. The [`nvprof` tools](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse the CUDA program. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) to profile CPU and Python program by [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile the only CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually wants to collect the time of each operator and locate bottlenecks. The `nvprof` usually collect the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And the `yep` and `Google's perftools` can't collect the timeline for CUDA program. All these tools can't collect time in the operator level. So we design this profiling tools. 
+There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). For most popular deep learning frameworks, they use several programming languages and adapt to heterogeneous platforms. Similar to most of the deep learning frameworks, PaddlePaddle also uses C++, CUDA and Python as the basic programming languages to adapt to run on CPU and GPU devices. The [`nvprof` tools](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse the CUDA program. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) to profile CPU and Python program by [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof) to profile only the CPU and Python program. But for [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), the operator is the basic computing unit. The developers usually want to collect the time of each operator and locate bottlenecks. The `nvprof` usually collect the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls and events or metrics for CUDA kernels. And the `yep` and `Google's perftools` can't collect the timeline for CUDA program. All these tools can't collect time in the operator level. So we design this profiling tool. ## Architecture -The work flow for most task is as follows. Each operator will run many times in the all iterations. So the profiler must collect the total time of each operator during the iteration. For more, sometimes, the developers want to collect more detailed time span inside the operator or record time span for elsewhere, this requires that the profiler must support to record the nested time span. 
And in order to speed training, all the deep learning framework supports parallel computing, including multi-threads on CPU and multi-GPUs. So the profiler must enable to collect the timeline for each thread. In addition, the profiler also occupies certain resources. It must can be easily to enable or disable by the developers. At last, the profiler should show a human-readable report.
+The work flow for most tasks is as follows. Each operator will run many times in all the iterations. So the profiler must collect the total time of each operator during the iteration. Moreover, sometimes the developers may want to collect more detailed time span inside the operator or record time span for elsewhere; this requires that the profiler must support recording nested time spans. And in order to speedup training, all the deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs. So the profiler must be able to collect the timeline for each thread. In addition, the profiler also occupies certain resources. It must be easy to enable or disable by the developers. At last, the profiler should present a human-readable report.
 
 ```python
 for i in xrange(M):  # M is the iteration number
-  for op in operator_lists: # The `operator_lists` is the all operators in the network graph.
+  for op in operator_lists: # The `operator_lists` contains all the operators in the network.
     op.run();
 ```
 
-In a summary, the proflier should have follow features:
+In summary, the profiler should have the following features:
 
-- record time span in loop.
-- support nested time span.
-- support multi-threads/multi-GPUs.
-- support to enable and disable the profiler.
+- records time span in loop.
+- supports nested time span.
+- supports multiple threads/multiple GPUs.
+- supports to be enabled and disabled by users.
 
-But how to record the time for the mixed C++ and CUDA program? 
There many C++ interfaces to get the current calendar time in host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different streams (http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels is asynchronous with the host program if there is no the synchronous aftern the CUDA kernels. The CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device's perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply the events to record the timeline. Then summary and show statistics based on these events. +But how to record the time for the mixed C++ and CUDA program? There many C++ APIs to get the current calendar time in host program. But for GPU, the CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams) and the CUDA kernels is asynchronous with the host program if there is no the synchronous aftern the CUDA kernels. CUDA provides [event](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA event, we also design and apply the events to record the timeline. Then summarize and present statistics based on these events. -The overall flow is shown as following figure. +The overall flow is shown as the following figure.
 ### Event
 
-In above work flow, a pair of events are needed before and aftern the piece of code to collect time. So the event has a flag to mark it is starting event or ending event. There three kinds of event:
+In the above work flow, a pair of events are needed before and after the piece of code to collect time. So the event has a flag to mark whether it is a starting event or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to specify the profiling start or end. There are three kinds of events:
 
 ```c++
-enum EventKind { kMark,
+enum EventKind {
+  kMark,
   kPushRange,
   kPopRange};
 ```
 
-- kMark: only a mark.
+- kMark: only a marker without time range.
 - kPushRange: mark the starting event for time range.
-- kPopRange: mark the ending event for the time range.
+- kPopRange: mark the ending event for time range.
+
+For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, event lists are used to record each piece.
 
-For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, a event lists are used to record each piece.
 ```c++
 class Event {
  public:
@@ -64,7 +66,7 @@ struct EventList {
 };
 ```
 
-As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or distable the profiler.
+As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
```c++ enum ProfilerState { @@ -72,11 +74,11 @@ enum ProfilerState { kCPU, kCUDA }; -ProfilerState kState; +ProfilerState g_state; ``` -- kDisabled: the disabled state. -- kCPU: profiling for CPU code. -- kCUDA: profiling for GPU code. +- kDisabled: the disabled state. +- kCPU: CPU profiling state. +- kCUDA: GPU profiling state. A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`. From 10cd6eb67a7177bbf95300c0c8512650d27e57e5 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 2 Jan 2018 12:51:25 +0800 Subject: [PATCH 06/20] Add doc for lod_rank_table. --- python/paddle/v2/fluid/layers/control_flow.py | 37 +++++++++++++++++-- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 22a37c22c3..48f1ffa668 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -397,9 +397,40 @@ class While(object): def lod_rank_table(x, level=0): - """ - This function creates an operator for creating a LOD_RANK_TABLE - using the input x. + """LoD Rank Table Operator. Given an input variable `x` and a LoD level, + this layer creates a LodRankTable object. A LoDRankTable object contains a + list of bi-element tuples and each tuple consists of an index and a length. + For given level's LoD information, the index is the sequence position and + the length representes the sequence length. Please note that the list is + ranked in descending order by the length. The following is an example: + + .. 
code-block:: text + + x is a LoDTensor: + x.lod = [[0, 1, 2, 3], + [0, 5, 6, 7]] + x.data = [a, b, c, d, e, f, g] + + Create lod rank table: + lod_rank_table_obj = lod_rank_table(x, level=1) + + Get: + lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)] + + Args: + x (Variable): Input variable, a LoDTensor based which to create the lod + rank table. + level (int): Specify the LoD level. + + Returns: + Variable: The created LoDRankTable object. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10], + dtype='float32', lod_level=1) + out = layers.lod_rank_table(x=x, level=0) """ helper = LayerHelper("lod_rank_table", **locals()) table = helper.create_variable( From 57bc564d12d5910f3f03d52ac9616b9e72ed4de2 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 2 Jan 2018 14:01:17 +0800 Subject: [PATCH 07/20] Polish doc for lod_rank_table. --- python/paddle/v2/fluid/layers/control_flow.py | 31 ++++++++++++------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 48f1ffa668..458ced460a 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -397,25 +397,34 @@ class While(object): def lod_rank_table(x, level=0): - """LoD Rank Table Operator. Given an input variable `x` and a LoD level, - this layer creates a LodRankTable object. A LoDRankTable object contains a - list of bi-element tuples and each tuple consists of an index and a length. - For given level's LoD information, the index is the sequence position and - the length representes the sequence length. Please note that the list is - ranked in descending order by the length. The following is an example: + """LoD Rank Table Operator. Given an input variable **x** and a level number + of LoD, this layer creates a LodRankTable object. A LoDRankTable object + contains a list of bi-element tuples. 
Each tuple consists of an index and + a length, both of which are int type. Reffering to specified level of LoD, + the index is the sequence index number and the length representes the + sequence length. Please note that the list is ranked in descending order by + the length. The following is an example: .. code-block:: text x is a LoDTensor: - x.lod = [[0, 1, 2, 3], + x.lod = [[0, 2, 3], [0, 5, 6, 7]] x.data = [a, b, c, d, e, f, g] - Create lod rank table: - lod_rank_table_obj = lod_rank_table(x, level=1) + 1. set level to 0: + Create lod rank table: + lod_rank_table_obj = lod_rank_table(x, level=0) - Get: - lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)] + Get: + lod_rank_table_obj.items() = [(0, 2), (1, 1)] + + 2. set level to 1: + Create lod rank table: + lod_rank_table_obj = lod_rank_table(x, level=1) + + Get: + lod_rank_table_obj.items() = [(0, 5), (1, 1), (2, 1)] Args: x (Variable): Input variable, a LoDTensor based which to create the lod From a5200b89ac6b60b6e2f5e5a3eb374502e1285772 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 2 Jan 2018 14:55:18 +0800 Subject: [PATCH 08/20] Add doc for max_sequence_len. --- python/paddle/v2/fluid/layers/control_flow.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 22a37c22c3..0f8295d177 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -414,9 +414,25 @@ def lod_rank_table(x, level=0): def max_sequence_len(rank_table): - """ - This function creates an operator to calculate the length of - max seqence through input rank_table(should be a lod_rank_table) + """Max Sequence Len Operator. Given a LoDRankTable object, this layer + returns the max length of batch of sequences. 
In fact, a LoDRankTable object + contains a list of tuples () and the list + is already sorted by sequence length in descending order, so the operator + just returns the sequence length of the first tuple element. + + Args: + rank_table (Variable): Input variable which is a LoDRankTable object. + + Returns: + Variable: the max length of sequence. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10], + dtype='float32', lod_level=1) + rank_table = layers.lod_rank_table(x=x, level=0) + max_seq_len = layers.max_sequence_len(rank_table) """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_tmp_variable(dtype="int64") From 554f6967127fec6f6847802333e988565c726fbe Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 2 Jan 2018 15:03:10 +0800 Subject: [PATCH 09/20] for del DEPS --- paddle/operators/CMakeLists.txt | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index bfcc70b31d..9f603474de 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -186,36 +186,6 @@ endfunction() add_subdirectory(math) add_subdirectory(nccl) -set(DEPS_OPS - cond_op - cross_entropy_op - recurrent_op - softmax_with_cross_entropy_op - softmax_op - sequence_softmax_op - sum_op - pool_op - maxout_op - unpool_op - pool_with_index_op - conv_op - conv_transpose_op - nccl_op - sequence_conv_op - sequence_pool_op - lod_rank_table_op - lod_tensor_to_array_op - array_to_lod_tensor_op - max_sequence_len_op - lstm_op - gru_op - adagrad_op - sgd_op - save_op - load_op - send_op - recv_op - detection_output_op) if(WITH_GPU) op_library(nccl_op DEPS nccl_common) else() From 0d4fdce07f55957d2ade921dfb382c3f5ee790e8 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 2 Jan 2018 15:04:01 +0800 Subject: [PATCH 10/20] Minor refinement. 
--- python/paddle/v2/fluid/layers/control_flow.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 458ced460a..08c52390e9 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -429,7 +429,8 @@ def lod_rank_table(x, level=0): Args: x (Variable): Input variable, a LoDTensor based which to create the lod rank table. - level (int): Specify the LoD level. + level (int): Specify the LoD level, on which to create the lod rank + table. Returns: Variable: The created LoDRankTable object. From 899a79cceb5b949d41d25a93c6c4d79446ba41b9 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 2 Jan 2018 15:51:53 +0800 Subject: [PATCH 11/20] Feature/transform (#7111) * "fix data transform" * "data transformer" * "add device pool" * "add test" * "fix ci" * "fix datalayout implementation " * "fix based on comment" --- paddle/framework/CMakeLists.txt | 2 +- paddle/framework/data_transform.cc | 79 +++++++++++++++++++++++ paddle/framework/data_transform.h | 67 +++++++++++++++++++- paddle/framework/data_transform_test.cc | 83 +++++++++++++++++++++---- paddle/framework/operator.cc | 2 +- paddle/operators/math/math_function.cc | 9 ++- 6 files changed, 222 insertions(+), 20 deletions(-) diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 6788cb34fb..b4458eb955 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -29,7 +29,7 @@ cc_test(variable_test SRCS variable_test.cc) cc_library(scope SRCS scope.cc DEPS glog) cc_test(scope_test SRCS scope_test.cc DEPS scope) -cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto) +cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto) cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context) cc_library(attribute SRCS 
attribute.cc DEPS framework_proto) diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc index 376268888e..58780e3863 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/framework/data_transform.h" #include "paddle/framework/lod_tensor.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace framework { @@ -23,5 +24,83 @@ DataTransformFnMap& DataTransformFnMap::Instance() { return data_transform_map; } +auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(), + DataLayout::kNHWC, LibraryType::kPlain); + +auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), + DataLayout::kNHWC, LibraryType::kPlain); + +auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), + DataLayout::kNHWC, LibraryType::kPlain); + +auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), + DataLayout::kNCHW, LibraryType::kPlain); + +void TransDataType(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + + auto dims = src.dims(); + dst->Resize(dims); + auto dst_type = kernel_pair.second.data_type_; + auto src_type = kernel_pair.first.data_type_; + + switch (src_type) { + case proto::DataType::FP32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::FP64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::INT32: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case 
proto::DataType::INT64: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + case proto::DataType::BOOL: + framework::VisitDataType(dst_type, CastDataType(src, dst, ctx)); + break; + default: + PADDLE_THROW("Not support type %d", src_type); + } +} + +void TransDataLayout(const platform::DeviceContext* ctx, + const KernelTypePair& kernel_pair, const Variable& in, + Variable* out) { + PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!."); + PADDLE_ENFORCE( + platform::places_are_same_class(kernel_pair.first.place_, + kernel_pair.second.place_), + "TransDataType Only Support DataType transform on same place!"); + + auto src = in.Get(); + auto* dst = out->GetMutable(); + PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); + + dst->Resize(src.dims()); + auto place = kernel_pair.second.place_; + CopyFrom(src, place, *ctx, dst); + const std::vector axis = {0, 2, 3, 1}; + + auto src_type = kernel_pair.first.data_type_; + framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis)); + + dst->set_layout(kernel_pair.second.data_layout_); +} + } // namespace framework } // namespace paddle + +namespace f = paddle::framework; +REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType); +REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout); diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h index bd6d301c12..9abb3c99bf 100644 --- a/paddle/framework/data_transform.h +++ b/paddle/framework/data_transform.h @@ -21,16 +21,20 @@ limitations under the License. 
*/ #include "paddle/framework/op_kernel_type.h" #include "paddle/framework/tensor.h" #include "paddle/framework/variable.h" +#include "paddle/operators/math/math_function.h" #include "paddle/platform/device_context.h" #include "paddle/platform/macros.h" +#include "paddle/platform/transform.h" namespace paddle { namespace framework { -using DataTransformFn = std::function; using KernelTypePair = std::pair; +using DataTransformFn = + std::function; + struct KernelTypePairHash { static void HashCombine(const OpKernelType& t, std::size_t* seed) { OpKernelType::Hash kernel_type_hasher; @@ -45,6 +49,65 @@ struct KernelTypePairHash { } }; +template +struct CastDataTypeFunctor { + HOSTDEVICE inline OutType operator()(InType in) const { + return static_cast(in); + } +}; + +template +struct CastDataType { + CastDataType(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx) + : in_(in), out_(out), ctx_(ctx) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + + auto* in_begin = in_.data(); + auto numel = in_.numel(); + auto* in_end = in_begin + numel; + auto* out_begin = out_->mutable_data(place); + if (platform::is_cpu_place(place)) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + } else { + // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type? 
+ PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + +struct CastDataLayout { + CastDataLayout(const framework::Tensor& in, framework::Tensor* out, + const platform::DeviceContext* ctx, + const std::vector& axis) + : in_(in), out_(out), ctx_(ctx), axis_(axis) {} + const framework::Tensor in_; + framework::Tensor* out_; + const platform::DeviceContext* ctx_; + const std::vector axis_; + + template + void operator()() { + auto place = ctx_->GetPlace(); + if (platform::is_cpu_place(place)) { + operators::math::Transpose trans4; + auto* context = static_cast(ctx_); + trans4(*context, in_, out_, axis_); + } else { + PADDLE_THROW("Unsupport CPU <-> GPU!"); + } + } +}; + using DataTransformMap = std::unordered_map; diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index 5f05e881fa..5b01c8434b 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/framework/data_transform.h" +#include "paddle/platform/device_context.h" namespace paddle { namespace framework { @@ -31,16 +32,18 @@ using namespace platform; * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN */ -std::array kDataType = { - {proto::DataType::FP32, proto::DataType::FP64}}; +std::array kDataType = {proto::DataType::FP32, + proto::DataType::FP64}; -std::array kPlace = {{CPUPlace(), CUDAPlace(0)}}; +std::array kPlace = {CPUPlace(), CUDAPlace(0)}; std::array kDataLayout = { - {DataLayout::kNHWC, DataLayout::kNCHW}}; + DataLayout::kNHWC, DataLayout::kNCHW, +}; std::array kLibraryType = { - {LibraryType::kPlain, LibraryType::kMKLDNN}}; + LibraryType::kPlain, LibraryType::kMKLDNN, +}; OpKernelType GenFromBit(const std::vector bits) { return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], @@ -54,17 +57,20 @@ auto kernel1 = GenFromBit({0, 0, 0, 1}); auto kernel2 = GenFromBit({0, 0, 1, 0}); auto kernel3 = GenFromBit({0, 0, 1, 1}); -void 
TransDataType_t(const platform::DeviceContext* ctx, const Variable& in, +void TransDataType_t(const platform::DeviceContext* ctx, + const KernelTypePair& p, const Variable& in, Variable* out) { test_value++; } -void TransDataLayout_t(const platform::DeviceContext* ctx, const Variable& in, +void TransDataLayout_t(const platform::DeviceContext* ctx, + const KernelTypePair& p, const Variable& in, Variable* out) { test_value--; } -void TransLibraryType_t(const platform::DeviceContext* ctx, const Variable& in, +void TransLibraryType_t(const platform::DeviceContext* ctx, + const KernelTypePair& p, const Variable& in, Variable* out) { test_value += 2; } @@ -83,17 +89,68 @@ TEST(DataTransform, Register) { using namespace paddle::platform; auto& instance = DataTransformFnMap::Instance(); - ASSERT_EQ(instance.Map().size(), 3UL); - DeviceContext* ctx = nullptr; paddle::framework::Variable in; paddle::framework::Variable out; - instance.Get(std::make_pair(frw::kernel0, frw::kernel1))(ctx, in, &out); + DeviceContext* ctx = new CPUDeviceContext(); + auto pair0 = std::make_pair(frw::kernel0, frw::kernel1); + instance.Get(pair0)(ctx, pair0, in, &out); ASSERT_EQ(test_value, 1); - instance.Get(std::make_pair(frw::kernel1, frw::kernel2))(ctx, in, &out); + auto pair1 = std::make_pair(frw::kernel1, frw::kernel2); + instance.Get(pair1)(ctx, pair1, in, &out); ASSERT_EQ(test_value, 0); - instance.Get(std::make_pair(frw::kernel0, frw::kernel2))(ctx, in, &out); + auto pair3 = std::make_pair(frw::kernel0, frw::kernel2); + instance.Get(pair3)(ctx, pair3, in, &out); ASSERT_EQ(test_value, 2); } + +TEST(DataTransform, Layout) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto& instance = DataTransformFnMap::Instance(); + Variable in; + Variable out; + Tensor* src = in.GetMutable(); + src->mutable_data(make_ddim({2, 3, 1, 2}), CPUPlace()); + src->set_layout(DataLayout::kNHWC); + + DeviceContext* ctx = new CPUDeviceContext(); + + { + auto kernel1 = GenFromBit({1, 
0, 0, 0}); + auto kernel2 = GenFromBit({1, 0, 1, 0}); + auto pair0 = std::make_pair(kernel1, kernel2); + instance.Get(pair0)(ctx, pair0, in, &out); + } + + Tensor dst = out.Get(); + EXPECT_TRUE(dst.layout() != src->layout()); +} + +TEST(DataTransform, DataType) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto& instance = DataTransformFnMap::Instance(); + DeviceContext* ctx = new CPUDeviceContext(); + + Variable in; + Variable out; + Tensor* src = in.GetMutable(); + float* ptr = src->mutable_data(make_ddim({2, 3}), CPUPlace()); + for (int i = 0; i < 6; ++i) { + ptr[i] = i / 3; + } + + { + auto kernel1 = GenFromBit({0, 0, 0, 0}); + auto kernel2 = GenFromBit({1, 0, 0, 0}); + auto pair0 = std::make_pair(kernel1, kernel2); + instance.Get(pair0)(ctx, pair0, in, &out); + } + Tensor dst = out.Get(); + EXPECT_TRUE(dst.data() != nullptr); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index a3ce96c409..fc7091f1c8 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -461,7 +461,7 @@ void OperatorWithKernel::Run(const Scope& scope, dev_ctx->Wait(); for (auto var_name : need_trans) { - (*trans_fun)(trans_dev_ctx, *(scope.FindVar(var_name)), + (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)), scope.FindVar(var_name + framework::KernelTypeToString( expected_kernel_key))); } diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index d4f12f0a10..dcf4b85e1a 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -245,9 +245,12 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; 
DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); From 7be57de9434053e7aa2e7b1d78da62ee1cb41ba7 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 2 Jan 2018 16:55:51 +0800 Subject: [PATCH 12/20] enhance no_grad_var handling --- python/paddle/v2/fluid/backward.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index f11c83f59c..43e9abc354 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -57,6 +57,8 @@ def _all_in_set_(cands, s): """ Test if all elements of 'cands' are in set 's' """ + if len(cands) == 0: + return False for c in cands: if not c in s: return False @@ -138,10 +140,20 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): 1. all outputs of the grad op are in 'no_grad_set' 2. (TODO) all grad inputs of the grad op are in 'no_grad_set' """ + + def _op_can_be_removed_(op_desc, no_grad_set): + if _all_in_set_(op_desc.output_arg_names(), no_grad_set): + return True + if _all_in_set_( + filter(lambda name: name.find(core.grad_var_suffix()) != -1, + op_desc.input_arg_names()), no_grad_set): + no_grad_set.union(op_desc.output_arg_names()) + return True + return False + # Remove ops whose outputs are all in no_grad_dict op_descs = filter( - lambda op_desc: not _all_in_set_(op_desc.output_arg_names(), no_grad_set), - op_descs) + lambda op_desc: not _op_can_be_removed_(op_desc, no_grad_set), op_descs) # Insert fill_zeros_like_op to_insert = [] for idx, op_desc in enumerate(op_descs): From 8d4a607fb35a6eb9b5eacf9999f955bde911e2ad Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 2 Jan 2018 17:30:40 +0800 Subject: [PATCH 13/20] update backward doc --- doc/design/backward.md | 6 ++++-- python/paddle/v2/fluid/backward.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/design/backward.md b/doc/design/backward.md index 35f03692bb..20fda7a98f 100644 --- a/doc/design/backward.md +++ 
b/doc/design/backward.md @@ -106,9 +106,11 @@ See function `_addup_repetitive_outputs_` in `backward.py` for implementation de In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass. -But these unnecessary gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. +Another situation is all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. For `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped. -This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes whose outputs are all in `no_grad_set` or inserts `fill_zeros_like_op` when its necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False). +It should be noted that all these zero gradients still need to be creating and initialized by something, otherwise following `grad_op`s who take these gradients as inputs take the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros. + +This features are implemented in function `_remove_no_grad_branch_`. It checks new created `grad_op`s one-by-one, removes who can be skipped and inserts `fill_zeros_like_op` when its necessary. 
We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False). ### Creating Backward Variables diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index 43e9abc354..a1be768daa 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -138,7 +138,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): Remove unnecessary grad ops A grad op can be removed in two cases: 1. all outputs of the grad op are in 'no_grad_set' - 2. (TODO) all grad inputs of the grad op are in 'no_grad_set' + 2. all grad inputs of the grad op are in 'no_grad_set' """ def _op_can_be_removed_(op_desc, no_grad_set): From 1bcf7e23bdf23ad8a96cf75a42a37f1e45fea89b Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 2 Jan 2018 19:29:05 +0800 Subject: [PATCH 14/20] Minor refinement. --- python/paddle/v2/fluid/layers/control_flow.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 0f8295d177..114d46b5f8 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -415,16 +415,16 @@ def lod_rank_table(x, level=0): def max_sequence_len(rank_table): """Max Sequence Len Operator. Given a LoDRankTable object, this layer - returns the max length of batch of sequences. In fact, a LoDRankTable object - contains a list of tuples () and the list - is already sorted by sequence length in descending order, so the operator - just returns the sequence length of the first tuple element. + returns the max length of a batch of sequences. In fact, a LoDRankTable + object contains a list of tuples() and + the list is already sorted by sequence length in descending order, so the + operator just returns the sequence length of the first tuple element. 
Args: rank_table (Variable): Input variable which is a LoDRankTable object. Returns: - Variable: the max length of sequence. + Variable: The max length of sequence. Examples: .. code-block:: python From 33e75201e9d3c14945bbe556267b8bae069de327 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 2 Jan 2018 20:00:00 +0800 Subject: [PATCH 15/20] fix bugs --- python/paddle/v2/fluid/backward.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py index a1be768daa..ac60bf5436 100644 --- a/python/paddle/v2/fluid/backward.py +++ b/python/paddle/v2/fluid/backward.py @@ -142,12 +142,13 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): """ def _op_can_be_removed_(op_desc, no_grad_set): - if _all_in_set_(op_desc.output_arg_names(), no_grad_set): + out_arg_names = op_desc.output_arg_names() + if len(out_arg_names) == 0 or _all_in_set_(out_arg_names, no_grad_set): return True if _all_in_set_( filter(lambda name: name.find(core.grad_var_suffix()) != -1, op_desc.input_arg_names()), no_grad_set): - no_grad_set.union(op_desc.output_arg_names()) + no_grad_set.union(out_arg_names) return True return False @@ -296,7 +297,9 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): block_no_grad_set.add(_append_grad_suffix_(var.name)) no_grad_dict[block.idx] = block_no_grad_set elif isinstance(no_grad_set, set): - no_grad_dict = {0: no_grad_set} + no_grad_dict = { + 0: set([_append_grad_suffix_(name) for name in no_grad_set]) + } else: raise ValueError("'no_grad_set' should be a set or None.") From fba6a10dd99edf6110280754555af78889f19dd3 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Tue, 2 Jan 2018 21:00:09 +0800 Subject: [PATCH 16/20] fix bug in TransDataLayout (#7137) --- paddle/framework/data_transform.cc | 11 ++++++++++- paddle/framework/data_transform_test.cc | 14 +++++++------- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/paddle/framework/data_transform.cc 
b/paddle/framework/data_transform.cc index 58780e3863..9d6a842442 100644 --- a/paddle/framework/data_transform.cc +++ b/paddle/framework/data_transform.cc @@ -87,11 +87,20 @@ void TransDataLayout(const platform::DeviceContext* ctx, auto* dst = out->GetMutable(); PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!"); - dst->Resize(src.dims()); + auto src_dim = src.dims(); + dst->Resize(src_dim); auto place = kernel_pair.second.place_; CopyFrom(src, place, *ctx, dst); const std::vector axis = {0, 2, 3, 1}; + std::vector dst_dim; + dst_dim.resize(axis.size()); + for (size_t i = 0; i < axis.size(); i++) { + dst_dim[i] = src_dim[axis[i]]; + } + + dst->Resize(make_ddim(dst_dim)); + auto src_type = kernel_pair.first.data_type_; framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis)); diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc index 5b01c8434b..8665b6248f 100644 --- a/paddle/framework/data_transform_test.cc +++ b/paddle/framework/data_transform_test.cc @@ -32,18 +32,18 @@ using namespace platform; * 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN */ -std::array kDataType = {proto::DataType::FP32, - proto::DataType::FP64}; +std::array kDataType = { + {proto::DataType::FP32, proto::DataType::FP64}}; -std::array kPlace = {CPUPlace(), CUDAPlace(0)}; +std::array kPlace = {{CPUPlace(), CUDAPlace(0)}}; -std::array kDataLayout = { +std::array kDataLayout = {{ DataLayout::kNHWC, DataLayout::kNCHW, -}; +}}; -std::array kLibraryType = { +std::array kLibraryType = {{ LibraryType::kPlain, LibraryType::kMKLDNN, -}; +}}; OpKernelType GenFromBit(const std::vector bits) { return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]], From f3812825d06c2e9fb2311ea3890f70fc2dcf0836 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Tue, 2 Jan 2018 13:43:47 -0800 Subject: [PATCH 17/20] Added documentation for topk (#6861) --- python/paddle/v2/fluid/layers/control_flow.py | 24 +++++++++++++++++++ 1 file 
changed, 24 insertions(+) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index a055cea1bf..588114a275 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -485,6 +485,30 @@ def max_sequence_len(rank_table): def topk(input, k): + """ + **topk** + + This function performs the operation that selects the k entries in the input + vector and outputs their values and indices as vectors. Thus topk_out[j] is + the j-th largest entry in input, and its index is topk_indices[j] + + Args: + input (Variable|list): The input tensor that has all the data. + k (int): The number of top elements that the function will pick. + + Returns: + Variable: The variable of type array that contains the k largest entries + from input. + Variable: The variable of type array that contains the indices of k + largest entries from input. + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10]) + k = 5 + array = fluid.layers.topk(x, k) + """ helper = LayerHelper('topk', **locals()) topk_out = helper.create_tmp_variable(dtype=input.data_type) topk_indices = helper.create_tmp_variable(dtype='int64') From e9a60e4c8e7f73b3b1e33cec4fd2d855055cd1eb Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Tue, 2 Jan 2018 14:45:42 -0800 Subject: [PATCH 18/20] Adding API docs for ones and zeros methods (#7150) --- python/paddle/v2/fluid/layers/tensor.py | 40 ++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py index e5820d24cd..9ce25a9e08 100644 --- a/python/paddle/v2/fluid/layers/tensor.py +++ b/python/paddle/v2/fluid/layers/tensor.py @@ -201,15 +201,47 @@ def fill_constant_batch_size_like(input, def ones(shape, dtype): """ - This function performs the same function as fill_constant() declared above - with the constant value being 1.0. 
+ **ones** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with 1. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.ones(shape=[1], dtype='int64') """ return fill_constant(value=1.0, **locals()) def zeros(shape, dtype): """ - This function performs the same function as fill_constant() declared above - with the constant value being 0.0. + **zeros** + + This function creates a tensor of specified *shape* and + *dtype*, and initializes this with 0. + + It also sets *stop_gradient* to True. + + Args: + shape(tuple|list|None): Shape of output tensor + dtype(np.dtype|core.DataType|str): Data type of output tensor + + Returns: + Variable: The tensor variable storing the output + + Examples: + .. code-block:: python + + data = fluid.layers.zeros(shape=[1], dtype='int64') """ return fill_constant(value=0.0, **locals()) From 27fea24fd15a3d878df78786374820e78d83c045 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Tue, 2 Jan 2018 14:46:13 -0800 Subject: [PATCH 19/20] Addign document for fluid split_lod_tensor and merge_lod_tensor (#6859) * Addign document for fluid split_lod_tensor * Adding document for fluid merge_lod_tensor --- python/paddle/v2/fluid/layers/control_flow.py | 64 +++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py index 588114a275..acc22bef98 100644 --- a/python/paddle/v2/fluid/layers/control_flow.py +++ b/python/paddle/v2/fluid/layers/control_flow.py @@ -16,6 +16,36 @@ __all__ = [ def split_lod_tensor(input, mask, level=0): + """ + **split_lod_tensor** + + This function takes in an input that contains the complete lod information, + and takes in a mask which is used to 
mask certain parts of the input. + The output is the true branch and the false branch with the mask applied to + the input at a certain level in the tensor. + + Args: + input(tuple|list|None): The input tensor that contains complete + lod information needed to construct the output. + mask(list): A bool column vector which masks the input. + level(int): The specific lod level to rank. + + Returns: + Variable: The true branch of tensor as per the mask applied to input. + Variable: The false branch of tensor as per the mask applied to input. + + Examples: + .. code-block:: python + + x = layers.data(name='x', shape=[1]) + x.persistable = True + + y = layers.data(name='y', shape=[1]) + y.persistable = True + + out_true, out_false = layers.split_lod_tensor( + input=x, mask=y, level=level) + """ helper = LayerHelper('split_lod_tensor', **locals()) out_true = helper.create_tmp_variable(dtype=input.dtype) out_false = helper.create_tmp_variable(dtype=input.dtype) @@ -32,6 +62,40 @@ def split_lod_tensor(input, mask, level=0): def merge_lod_tensor(in_true, in_false, x, mask, level=0): + """ + **merge_lod_tensor** + + This function takes in an input :math:`x`, the True branch, the False + branch and a binary :math:`mask`. Using this information, this function + merges the True and False branches of the tensor into a single Output + at a certain lod level indiacted by :math:`level`. + + Args: + in_true(tuple|list|None): The True branch to be merged. + in_false(tuple|list|None): The False branch to be merged. + x(tuple|list|None): The input tensor that contains complete + lod information needed to construct the output. + mask(list): A bool column vector which masks the input. + level(int): The specific lod level to rank. + + Returns: + Variable: The merged output tensor. + + Examples: + .. 
code-block:: python + + x = layers.data( + name='x', shape=[1], dtype='float32', stop_gradient=False) + y = layers.data( + name='y', shape=[1], dtype='bool', stop_gradient=False) + + level = 0 + + out_true, out_false = layers.split_lod_tensor( + input=x, mask=y, level=level) + out = layers.merge_lod_tensor( + in_true=out_true, in_false=out_false, mask=y, x=x, level=level) + """ helper = LayerHelper('merge_lod_tensor', **locals()) out = helper.create_tmp_variable(dtype=in_true.dtype) helper.append_op( From 87f46ebb368929feae76b7d909944b317d7dad92 Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Tue, 2 Jan 2018 14:46:49 -0800 Subject: [PATCH 20/20] Add squared error layers doc (#6862) --- python/paddle/v2/fluid/layers/nn.py | 32 +++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 55b35ad543..55d8bf8a8a 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -426,8 +426,36 @@ def cross_entropy(input, label, **kwargs): def square_error_cost(input, label, **kwargs): """ - This functions returns the squared error cost using the input and label. - The output is appending the op to do the above. + **Square error cost layer** + + This layer accepts input predictions and target label and returns the squared error cost. + For predictions, :math:`X`, and target labels, :math:`Y`, the equation is: + + .. math:: + + Out = (X - Y)^2 + + In the above equation: + + * :math:`X`: Input predictions, a tensor. + * :math:`Y`: Input labels, a tensor. + * :math:`Out`: Output value, same shape with :math:`X`. + + Args: + input(Variable): Input tensor, has predictions. + label(Variable): Label tensor, has target labels. + + Returns: + Variable: The tensor variable storing the element-wise squared error difference \ + of input and label. + + Examples: + .. 
code-block:: python + + y = layers.data(name='y', shape=[1], dtype='float32') + y_predict = layers.data(name='y_predict', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + """ helper = LayerHelper('square_error_cost', **kwargs) minus_out = helper.create_tmp_variable(dtype=input.dtype)