From 0704b0b6ffe8906e463d1897657e938406e1d185 Mon Sep 17 00:00:00 2001 From: Ville Puuska Date: Sun, 15 Dec 2024 16:59:21 +0200 Subject: [PATCH 1/2] pass 'style_map' kwarg to mammoth when converting docx --- src/markitdown/_markitdown.py | 4 +++- tests/test_files/test_with_comment.docx | Bin 0 -> 12971 bytes tests/test_markitdown.py | 20 ++++++++++++++++++++ 3 files changed, 23 insertions(+), 1 deletion(-) create mode 100755 tests/test_files/test_with_comment.docx diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 96997cf..e3b028e 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -492,7 +492,9 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: result = None with open(local_path, "rb") as docx_file: - result = mammoth.convert_to_html(docx_file) + style_map = kwargs.get("style_map", None) + + result = mammoth.convert_to_html(docx_file, style_map=style_map) html_content = result.value result = self._convert(html_content) diff --git a/tests/test_files/test_with_comment.docx b/tests/test_files/test_with_comment.docx new file mode 100755 index 0000000000000000000000000000000000000000..8fc1745c71ef1f1d41429a4da95f8ff95e214a26 GIT binary patch literal 12971 zcmaia19+v$vv-n-ZEIpX6Wg|J+r}gl+s?$cjfri0Vq;?5WdGRNz2Duh&Qs@kPj^-I z+f{w4^^=wO1dI#-4h{}*?i{5C@EbwBKkL{VSvt_te0-NrcZ+|bhX>vC$kI>oaA{By zG^}k8o=g+<@{Jxn15J|@El}TBmxXMRRqn=ReAwS7%v-~L>2NQM#{mN=ZSf2B*E{aX z-gLUFYYUQUG}SS!%on^m(61Pl9)uK*x%W*T6V={NB@DEmakx?+riVeVcboaN)nax= z_j2NEJ*{R{?q63=S>&7*^STg`DX2k>2}7#huhZ|n_rx%R@z=Wobk!6V%%5OFec8*8q={2aZbBb zr;6x@W08fjReJ-~qo(ZXBW&ahag@^MBmC*v{e|L=3Jg=L`nzh)n9xbPgtbN0dT!9I z(r17TM!w(|OK}Rg+0LPhWyp+SS`2d^VqV}&iKzPLrn85B)h+W7a!mYLmNcuO&S zSH$OUk7^?Jt+N?6yiNtLZV*HP(q+~*VcC!CNmoc__55y#+ost26}odgbFX|(wWx{$ zw1{HjE4;BU`(_LpexWnYRV)uDO+gW6-dDZ!Jv(KpX@6*_*a92k*I(S=ncMw}F6qOj z%a#%T2D-Pr_;yvJ;HS?%%GCF{o7br&zvgK>flz4osq<+AZOV(C7<~XJ0iRzFm@8&m z4u3Y^X0=(r!=@ zR0kBl;N#0(-4TbFKWIv~GS)J7ZST4rd__X1jMoehLkhttKahA%rs*;!zZT>X0W_vWja@-6INeG zvWOkT?j^sUT_)~K(C3lYaZi=xDkttdt!6aKPYA1B*&au}Z91e+!IUhzOgT+*k+%`( zzKXcGSkGlnwCGN+4uj$+FrKT^vZ*3U@j{0?;)6SGcUlRp3~(f&89w81bV2w-YHRkH zbwWzKBezfSC~7Hvb4cYDnf*@cmjqJusm5AgS?=k^@k~z>-qiyyUZrbq{Uo;s#{zf>O0Q zFK7~i1P=j*QQ=##6(XDGFOF}*B(WLL5DMAIXWGd#MUMBRoNNQeu`ABWEd?Sy5O}gZ zTplsMNWdCyoZqk8eD<))^)^{S`{>u!Jb-V~`&(bW^Z%V1u-|K4Z0rqb|CK=BvxJkC zk+tJT-~N?MV@ITW>EMO#ULY-Qe~3X9@!PfVCDaIQeF|rEE4mc8^&Q;U;m$5r6r!qW zbK!J}ltX?e zG+E}1BLbQU?3S4raAd@YnC2 zI@MaUUui+;I9JGSu{S1Ci#6i7Ws~l=llEpGGQ%Wwk~!x^jF*iAh67Mnu6V@sYUdW~ z1mO*dHCmWTk@I3tA5tX&r$TMr@NSim{)wlvzK79@uq>*S?)2@2FDQgSIfRPDB%@Mi z{U*~Txw=i%wE)s?J_TyV?O_{^XIvTyMWvaH>T5txdP%KxewYH1jL^6gZBn(eaDD?? z>9b;fak$X^K>*5Q8Do6|=h_Bb*kcJY$XUMl{_3{8%)M%D0)*qqy4L?+cao}=R-d$s+ErJ-%N zEJ=uCI5AMnkFtcS6JmaCTe?!~yYWf-bGTK1BzXffO>ej(W4-}r022OJh!;4J@OkJWAIcwu*6C{it);jWV zo88%&?WgG0&R}I`ICY4og^Y8rO*;DcJYeh{bKDsh-uAf}&1uM`*JMGSz*tme^_8FM z1$*6J#AGBHkr?2~p9tIERu7k*WXO=U&?@rXY@Wr+9t%B5;VIvlTsbxbcKG;(UYF>q zpd8mUdiFrwhN68lX?l)Bz7Boy8V(andLdvgG}F2j2jFInJgLdLVYb$X{Kg|nWvlQ; z(oB$|N1R<*k(87S+|Dy-z8fs{a&ak)GQVj{-*e`_A48&G$8o9~o~An7+B?B?TRHZ0IFQf?pqrj}UOka+;_JHbeV)j9I!z?Lk@3 zedG;d%O3I{5h@)0#rk+Bi-ODKNSv7rMuq${5U!F;8Dfk^iD?!yCiO!8$Q2-Alm<#P zod|tS7Iv>d%mFK3C40(sgXgg-@=KsclnDG$fi#NPl=r~*dBrG*dsArAl-Ym>bIQst zfy(7Em>$F>b&hAOQZt=@rpQr^{U{HwVY4o%0T&6Pq68OQ3=+~Gw#l20zJn|@UhJ6O zOoygd&}-oVw3O9@@fHh+!-Y`B;47gs%z&tpJtsy5wY`eju7cm^12qhh%n&$*#7+F- zbmlL08O5jY-DtH7L(Dcf$AEPDsf;|>Qdwi683s~K3dYe}^d ze=TB+K(Ldy|67W04c#;hGQIQ{7@9dh%B^HJq()AcDs$86U#jBBN;2R4DoOJkOOzso z5)q3Msy|mvnzfuPh@+d((!E0O7tlvxxX|%4EYLSeqhPQxAXMia)hAtBu&71LP&?-~ zb>~Msy^+S>byTSnE4B4%wxtMh+vdd^>B>GEe>;_n=XT(Phsch69#tzVMPG6XSs+uW z?v(^rz@$Xc%urr}Bve8L*<#u9-{8}RvR2$mm-LHj>UER>9`wy876b3)A59Jwo%2m; z*J%QRa8dLES5(B{EJ6EpuGMH-*=B=G;-8kv8J128|kVs^vOn zND;6($nsaElxA+{xbN|>Ja?+^_(n2G0ig_{YqZVgaBgm8oaIG&Ar29gb5M3FcMhI(ORBttF=1Uj&W z(R@Bz03Tnl0t>cceqZ<_?5~s*N~TpLYYJbK826u3uJN1 zll2pHCDBB4I3~>Gu*1w2JH{jtNV6$EB-`trJV0xKbxYjti|=6ftLed9>RB_cVd~UM z64!l0cszkRLf;yMFgNZZ*iwZ;uoZKw5zTilBSrwEIreXqUxh6%R#>S<^d7JavuF%M zt)5|O-m2opSP^<6_aLn20|mmcmuDvVL_UcONfhHMyomGyQT1{=leB)1!D#4J0|1A` z-y#TbB;Tx=4jCiJ*>UE6Oe^4ouU@-8`LgKivSCGXc|6Uu01hCHUt@eid@nXE&4k}cX#qI zvI%6*U?x5ppO`L=IO@7VoOo~YnXe%mI@D_w4S@KuVvLX$2bLbW*yMMWQzQrur06EH zks3FJM|SeZtJy%1$Q`GO(vrZISbl)w>ft79fXwZnV>TmHfN)!Kr|Dm&ynqy`eJazy z{&GtZMY$w~SSFzC4V{j~Dy0@%)8L7ssb$^2k2ODR5q9YI@PL3DuD#eu0To}X{ z+2NZFljaTg-66y6E+5!=EbcB4b)+c}fitAH zgdLiXNh*a;4=PK(hnz?$PXV#uXxBhg_?GNUJ`@DGo5~BFsV1pJpnJk4|aJqxO*zUCSMm@>s@Fi9&rZpnZ zwKZu7Gq7Q%%^J+r)E7-Gn`Syib8~df@dxK_zeC5dRAP1ya@;se8e~GeXc4&`(@`u~ zGwmW&gEZo4Rt-IeLMC8QL{TjSnXS!z*&!SINkCheK0gT=tTH6Z0-rp%TxXFG3j4rZ zf&FPwU+P*7wC{zi`_QehMQbax@bjk;1Z`QRan?W`r9nl=D8Yyyq={117M=(LKPs?k z0I+E8&Av7seK{u<7^lhjsxSX!?ec}Rx7@HheIjFQ3l_Z!Ai}R&GI!bmz?Lc*2vDl! z^4n*?ZZ4@GR@0SXbbQ6pgBx+g~DjEKdwsKx(YCwQ!ziV`Igh+)?15 zopK>*T$Q;O(X1R5G)D_t?}f2mDqwDU^P5mILTi+FqfF$`4yJkL`pX48NrS+|Bx! z#H?9?c91byTifFHwr{>UWC8 zMx_0GrGiU!eLJbO$>x|gdO=#Ywkiw5F!!)Y;WV2tzrMjhH9SJ?iUGl?C->%-IeoHN zi1@7*7q@j!wwjP)OcVE5b|eT`7Tc{?#FV=&ht=)czI~btwq?jH{43NfYHka4V1RvPg3&zHaYJySpT>_O~VHJ8Ji=$Z_D6okT4S zPj0ykh~$ZtvyOj@O}|%jXBCZ~9lm5+ooo2M*W>W6+(}ULW|fS?X2Z4VnqxhqcF5p# zXmt3PKh%R9SOc`IK@~Oh*#NfWqUDk5yWViJpAi$%xbJk^E$@8o?Oiu@b%aJQ03-Dr z!EEq?oaA?9!EW7^n7t+Eg_>LgxbbT&vPxJ6y77UV1k;XhkO5S`=XScczQe8GT|mv| z06K7{m0u|YpyqY?G&z;{{*2rCN2|L6MGXV*tz!9F-|_rf#;gN5hy-YPEV;SMf929C zQ};pQe;Un!8Q4I{M2OUQo%r!Zrw(8ZhPTn9be_A`6tIQR*ZM)Z+{<_h$dLtL?Wz;6 zVS-BC_Y5@q)T0(PTSmzQY!YleEf6=0`@onI$H9>TqDh@9$?e(ju zzZ8LIW&xr%&aWU+(wW!PK)cESjBvn7_8X*_P^H@p+Ig>P^+4_g*=bMPoHun5;;v-pD_MwV*PMTXeigqOn?`~ghxHx!*}tN8pQgDZM+ zZq%%sxAkCFiT0o*s$abqWiA~+({+LcW0i7$x*+Ceo~ zD#Ksigp|!Xz`s^O8Zk``1a90YnVh>Kj%2OY9+a_+rcLv>z_ccg+*DEPUrH_Y@1SL& z{z5oSO{@tr;Yw^%O}FUnzBfW%ud+-N9V?Q-{H&sks42d1IkA)|G`;q+95uJU+V>(b zU<1o@C~d=nWtNy}!)W)bsd@L(8d|Mpw}kO!d*Fp#5bpxJI(%S>0IlUYv2q>aEfOqD z(%WvH<8U`3Yu#ItKSmTGZ-qx%Ow*nVzaJX6QAPZ3d0*+bQ^OkDGR#sln=&#GYTFQstE{eW-jVERzAg|^=W zSqTZ!=@G^Qz)i#h7zuk)khbcS@^MI!xYJ>UnVNJX@q0iNfDmYs0fTM%dPsXf9x}ku z@*D^>quWDl`D#eI#*)<6H870r<}s{@1yRGoJD-Jd zNw&vLxo5Kc_?O42toE-Z6CyKJ-AXTnQRp%_#&89%W~G zPRR+Pttz~THw=icI8vA$?yDDqiAF9NG3&u(h;W?Q0-A;)gZ0WV4_j4EgPqzo)@qmq z068y;Z{qhW_4=n=D~`C61HpmxFgBGsWoqbdj+DWr$9Ly#7S(b_1@_P|n}LH2)g--C z&OmPu96GOI1z-1Tz$5WOggvC|xbnJb1pLKFL=XWHVl|Dw*@-d)nYU%#@5omXW%E5H zUp!;Rg(H2YS#(E|>q{IBgJXDuFZ;%{ZbJnz*8#62s}<*~)BrPcdIfjpBIAS?AX3A4 zbNyxJDayTgr5(AX+ewFHcC&>Rfho=2UG|X}-4dqaN!P+w@oRgyYU(%cB~uB`lC4D7 z*_~7EXB1Zg#36I6DYhzLqHA3U(LH_ys|7o8HvI?@R^12@E^YoHQ%-+swurlPBIv=v zNnJ%rQ#uMvCUg`q%MGnE6yt`?mF1;AtPa%ouMI^ZH-xx;YBM^g3?1;E_t6wp6; z=^$Z1g|ZpEAWfelS~0bhcUp?cu73e5rs`)5_?6QS8}x2av&dU|1WG5gfZ*$}Ci2AE z6(iy1#qq5o(bo*xp_YOwvL~S`!r>fI@SOJgmw;UAW3*Z>qXL_|gVn$&gzJa$V4zIZ z{#jL8+ApJUTk~d5am436X)B`|t7pznrF1iKq|_t+?Z&q^fKuF9uvnw(*Z8XyBauC& zCWX1T{hNqOhxcYXFXB1IY-VNi%I~g;JLnp~No2pGQ+~ zYD+Jvw$qq1lW8L&f~*jy$ES(9)#5_Uzi($eb+BsyESaG(IzKsg8#E3G#^8V;Njt~S z+ZhF!LRRTKcqL=r6=ku$wYRzh{d}=I9(@>5Hr!tefCsq?;SX#$F&NQUxoTUk>XE1YNsJlRmfl+>p6#r?6yOa zuR`5j;;@F18hf^deBY zGerS1lakNzSaY@%z49;!Rf+=dS~#SJl;Iq6@WUzQ;Jd=inyY%n-@sK%J_EfgZwfN| z+7)8*vnjxQUo8SvVo3ckh)JOPE+Vb?3Ek!08si*%Nr-uS*tRyc(x_cnRp@7H{tEQ% zScksJ@9HJ09Lf?XzyszG_Pff|N@5PBi6tk^N)k$#YbC^8g&OsSMDyk93=AjA)Jw(v zN)jX`YoF%c3*;h76U!8LO7ewE?B?VN=bFriq?s%JRW7O6N@~(*J|zBLV4jDmLd`7y zE`9PYO^oy*DENoqnjOD$X(Hx_(TI;Kjz6p3r6oU#DBp{UW6U+{XGPytY83uc{$DFK zABO*HBL8lhh{zRqR(~vZG}l9W6-B~^asFj1AZt6Ikf3GXPlRLErAY1;&%Clze(^EF z=ISoxCdD1deb<@zWo{+=QNIM=hpfzBp}&Mw`;wqyTcoA8?@;~4N>xO3$whyiMuZVD) zOx%MGlm~^*^)-?DE;CY_bERZ!f?HRzYx5^<5as+@o^WxMu`l_h=h|pi zyeA%Qsf)}NnXY-jzv^-ovaFRC_{C;+OBx+|(=N-}_-*ijcZuQV@X?cSk8=jTD@6&2 zbHQpHRCA+YE987lM8xroE7Hm#6v!N3VHbBaOgIRZ&&fqaT&vW{4ABo73uUI9s*kDB$HZQc-1FR-L>^Md9%`(QG8Q1u6xaFPw10y9 zlT-ZAEJYl7zGT6K7|BkFWEn+@r&h60f{#a^X3$aoqGb!x=ugELLio{9ie!v@U1J}f zZpltZb+$yoJ>`&5a$*J_X+*R9=%s}$dH)A#+TN5$k?bUeQf+DSU-plUGb7Iyqhyhg zm&9zNv>Q#b6nU;H%V&BzyJ^#y4I|W*Dzz8fw%hIKp)=9uD;c3tR;eNN+%~$H<4Vm7 zN2j|U@NmMUOF6o{HGPj(cp>JcwQ@uPd7|G02{62oAeu#6EWdmSH4H!=3Kk_IT-7`( zNU1DcELc9X_~{V-(?Rmb{{6$H+X+si@H&a;I*DMLR3@H{PI57MY%w{4!p#1_tQ<=w zrdZl%u{4C3FCN}`USg?xqN#g=Ok-JiQX*@9W<;mRV$0+RW`z?`b+r6(-l$R%kvBH2 zS8+Vg{S7AeS8qxnXD<6U7uxYVHf^A`41v4dhRZ_0yB>2IN7>JAD&!~H`_>A**`of_3|L{7mK_g}9_gUbqQbo`bH$uYI&8)m zvA$to?-D##=BiVB`XOkaa-nH~sB(3TwG$>`?3LP|ztTH?F*);@QVQQ4Cm7Zkc`gcU zoMsNok!bjSi8GiN=ZtH`DPgJ+o+FK*9lR;mnG3~wGQ{Y7fxTAcyzWD1>B4s3YC9D7 zT)qN@s&ei@=UJPD%M&blV0?bX!{Zh4<{r+59?kSktYJ5b}kyVcWSo4xHOX+T|>Q zuI$&V`Oi+qkG;;+yQb}_F2U2@werU)tvM^ze2qIa+#Xn9&$XY4OO+XXa8y9uijB*F z&+!tJ3VKG08)AlQ{HPg1N5C1Obf^ztTYIlPfSt@c1T zFvBya1+Fa(yr58~GU%XUQeQD)uM#Y&eDCuT^Y|3Q<)m{uNR<1act;`!mp{lO2YO9! zUDkwrT+azUX|i-yGYXuUE=d{5)I|T*ak2fwbJxwtnqIT>i)G5Je)|b^Cclnb72kvM zMn4#q>(kM&E$rY{s-_1I<1f~Tuds88Q%6t!V2g5d={zyH9yYZ`@ksb@O+V#sGL;a! z1!ByvrNOeo<^uJZ-<)k3SUfwSY{J@={c*B&&Lw>ohEQM2JY+j#F^~zLlIqfCb)4F% z`%u@*3+E0k8PG?m7#C<*VJSnu`sbozDp}JAA`beGr{;wbsVP$jrQGE8r^l63n>E(b zu(>e68X+KFlv?j~HmBSjpRp8IKU2PJ`FvaxgB#`I8oc8(;%ooueB!^?#0EB2ANT2h zoKTc2YS^UFA$xCBRy|915f!VK=7pjk%cZP$>Qa(V?XUwvVnhpg0U!F6e*Pvt7CrH1f?} zE0Lcq41iCq1W6N1hWo3s-(jBu^PASRs0zOy?M#6o zY=bcpt~_e$LWe54$dsC?U^n(feRG**RKt_Xi!CdM zp6G*sZ(QT_x$OD&=Bf^rKLrTFzB=&M$>2vZ3G?QrX@>#161fL7j;axZ){U)9a${(z zaH$W;95xslYp8hZqog0GqfVc$^n=M8UVTxHGPz}M^LYF`lLYuf#5Z`o##^{=u`w5&kpzP4m*p`8$#~*|kN++MS zMvHrvdO`8Z+@az#hQOltt3U?Tms&=0t5(#%IWZ~H9q%dE0Mq`EaaPGCR+u-i$A44e zSi_ve{zl?9^244S=~<{E(CiJ!lstU2jWUb!DZP}q9!2=qsWa#4`8vFe`-sTXqH>ip zcDYp1rgI41BzNjOL&D1!>_?lTv~qhh1Wsr+J)9$0_W;p!<8D@xQIS@GaXtOh)-aC} zm@9iL$wDUFPH`-ZAFoQ-m-o+^oP&K2tx!Kv_FweXGVr4hF#H71NutktJ{Q!D1Rl=$ zr-Wt{D5gN_zV_4(8>c&;qreT-mTl9R-*?_ig8uzwoCi2pfZGq$mIRMgYA z`~x)B9xL?$2MN4O@43okKLz?;WENPY%7<2mt-CFaDe)RNK=kvr$&BLjV2*!c2G{kT z*AD4K5}t4^=qGX$DDDiy^zRAQg(n@GHY?{)vyT_;M0aWD)M5fH8k2w`?h24Ni<964MK{KJ*}|$V5s1pw}1+y5y)h zdxzaghI`{0)gVoZ7S}&2Qdpn~Hnbes4QA~PzjF^Zv0tC()&IKpb%ezgSb+q^!%tO* z+;@vNOyt^d8%E)LOzd6s6?GFFT?gz1R6RD+6w@AGm5%#c!GvLo(rOO-&iC~49cRpqU z)I(MJVZz^Org70hb$F36jPoGF*)lj&b+IfzIb$$di4VkAt_C28R{}@n||NNC4O^vLKX#e=8|2VSl zQ#rMMpAcR;3#*=enBlo|N#^QkmZPNN=Fb4J4Y-vJ;GJ7{PhIHaqj+0HB{-tNJGk#w z4K^&&G&i1UfZez2hb5Z3ZxaF$lFGszQmikcp4Oh=3E%NP#bm@Ao1awfji1g^E1deF zBD=IonAbKXkIkim^>qu5O3Io7I`Yb(O1`TV5>*`1W>W*(QH_v-j1iwbA3Hpc5Dj5; zQZE^F=Ua=J(-eUoQjAar#^iyRJ{$WUbRYybPRR2>tlt;+cL{n8guhk?b@#(LU*{e< zzRArVgoMI0j0O(frrrR2-*5V%iBs|E2i$mi6vuaqJ5T8mmVwKvW)JR9+2P$}d(~DG zjr@hZk-VqJKrYCOrR3f}I#aJjUf1nN@8^77_6;Nna6ORPI5yicGS$bq)uk@heeXhJ=-Z+!OB`{-K($LTehum;uq30jY=7{SlhM9A5CxVGKD+!ZUPL++Iqc zm5X$@Tviq*@$-(ZDmYdpvPvb!g0vSs@m#1nBTX6@x7TL?98xq+7QepgjeMXoSffqS zC*X|Tpt}9%+`27~5vTZ{@=Xny(yA=st)vkrtOyDFiZ<;SJ8_CPH`r!GlL@Bx)YA4l zKE>opYEf9o_FsP&I-@kQB11@+O2$j0O>ZPqheQi_qqZ^Z@C{6Fn50E7L}KbjFW;mK zw1UtMgUq2w(H9hg%p#L)>@7FXw<#W8{+<%JCQfNTWq~)#I1--?`pKeD)4^ISqU7eg)k|~VO1*4;iK&=F7fQ#U_+WYFl1%O%i?>N^NF)`uS4S53%y$~yXtPT+_&9d!)L(A z7IUtqb|CRsG~?qAIu4E)iV3d}NYh{~pdh&-Gr~9mGfD@de(x7%KUP;iE ztN{a{;{x~BLg7BG#kTmSqH$VN+6NbmKAMnJ`*j40ncDBUwTL$FDU?R^)~kv-<2}P_ z%MO4>bh!gQAxu5mX461+19G7v5<>)h9skS|KZZ7L;l|KWy@d3dZ?$giY~XwPR=tb) zg5^Fv+YKYKw~6hIX6r8{#PHqSfwi-uD}zU9XC-W39rOhR=$Z9Sm<*oX;Cv2&R#WF& zgwq>lm&dH8K0>*J4RDkXv}r|doWblDYWwKvM>OVF(2uNl+s9b${GRouKtI9;2!ssq zCocbYK>7zR|F85LO8@VQzXPK`X23t`^LtbO3%&k#_1{svAL!{nY5IMD{a5|pSn9uP z|BjXY7;b-(;d^HPulAp~-@hC99X$F0sr{2ULH-$F`*%yf-+%tq50>xl{$J43zpMYg zJ^8pK{gbZWJNAeApI4@TSN?rv_}8}Mi2kko`+nx{>c1~ None: text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Test DOCX processing, with comments + result = markitdown.convert( + os.path.join(TEST_FILES_DIR, "test_with_comment.docx"), + style_map="comment-reference => ", + ) + for test_string in DOCX_COMMENT_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test PPTX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) for test_string in PPTX_TEST_STRINGS: From 0a7203b87653edf990e68e3d07e492625f35fe5e Mon Sep 17 00:00:00 2001 From: Ville Puuska Date: Sun, 15 Dec 2024 17:23:57 +0200 Subject: [PATCH 2/2] add style_map prop to MarkItDown class --- src/markitdown/_markitdown.py | 5 +++++ tests/test_markitdown.py | 9 +++++++++ 2 files changed, 14 insertions(+) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index e3b028e..f4487b6 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -856,6 +856,7 @@ def __init__( requests_session: Optional[requests.Session] = None, mlm_client: Optional[Any] = None, mlm_model: Optional[Any] = None, + style_map: Optional[str] = None, ): if requests_session is None: self._requests_session = requests.Session() @@ -864,6 +865,7 @@ def __init__( self._mlm_client = mlm_client self._mlm_model = mlm_model + self._style_map = style_map self._page_converters: List[DocumentConverter] = [] @@ -1038,6 +1040,9 @@ def _convert( if "mlm_model" not in _kwargs and self._mlm_model is not None: _kwargs["mlm_model"] = self._mlm_model + if "style_map" not in _kwargs and self._style_map is not None: + _kwargs["style_map"] = self._style_map + # If we hit an error log it and keep trying try: res = converter.convert(local_path, **_kwargs) diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index e069813..1a75ec7 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -150,6 +150,15 @@ def test_markitdown_local() -> None: text_content = result.text_content.replace("\\", "") assert test_string in text_content + # Test DOCX processing, with comments and setting style_map on init + markitdown_with_style_map = MarkItDown(style_map="comment-reference => ") + result = markitdown_with_style_map.convert( + os.path.join(TEST_FILES_DIR, "test_with_comment.docx") + ) + for test_string in DOCX_COMMENT_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test PPTX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) for test_string in PPTX_TEST_STRINGS: