From 20dd9a8d000bbae5813a3a6f705acb621aede180 Mon Sep 17 00:00:00 2001 From: fanghon Date: Tue, 3 Dec 2019 13:49:45 +0800 Subject: [PATCH] add jplag doc --- README.md | 19 ++- bin/.gitignore | 3 + bin/gui/plag/edu/PlagGUI$5.class | Bin 1417 -> 1442 bytes bin/gui/plag/edu/PlagGUI$6.class | Bin 3292 -> 3292 bytes bin/gui/plag/edu/PlagGUI$7.class | Bin 2222 -> 2222 bytes bin/gui/plag/edu/PlagGUI$8.class | Bin 1004 -> 1004 bytes bin/gui/plag/edu/PlagGUI$9.class | Bin 817 -> 817 bytes bin/gui/plag/edu/PlagGUI.class | Bin 5621 -> 5621 bytes bin/preprocess/plag/edu/TextExtractor.class | Bin 4587 -> 4588 bytes bin/shingle/plag/edu/ShingleSim$Fileter.class | Bin 996 -> 1043 bytes bin/shingle/plag/edu/ShingleSim.class | Bin 5367 -> 5367 bytes bin/utils/edu/AntFile.class | Bin 5235 -> 5193 bytes bin/utils/edu/FileIO.class | Bin 2445 -> 4644 bytes bin/utils/edu/StreamGobbler.class | Bin 1849 -> 1849 bytes bin/utils/edu/WinCMD.class | Bin 9661 -> 10996 bytes help.txt | 19 +-- out.txt | 112 +++++++-------- src/gui/plag/edu/PlagGUI.java | 3 +- src/jplag/doc/DocToken.java | 80 +++++++++++ src/jplag/doc/Language.java | 72 ++++++++++ src/jplag/doc/Parser.java | 127 ++++++++++++++++++ src/jplag/doc/TokenStructure.java | 36 +++++ src/jplag/options/CommandLineOptionsExt.java | 46 +++++++ src/preprocess/plag/edu/TextExtractor.java | 3 +- src/preprocess/plag/edu/Tokenizer.java | 23 +++- src/shingle/plag/edu/ShingleSim.java | 4 +- src/utils/edu/AntFile.java | 8 +- src/utils/edu/FileIO.java | 64 ++++++++- src/utils/edu/WinCMD.java | 65 +++++++-- testdata/doccn/dongxiao-2.html | 40 ++++++ 30 files changed, 633 insertions(+), 91 deletions(-) create mode 100644 src/jplag/doc/DocToken.java create mode 100644 src/jplag/doc/Language.java create mode 100644 src/jplag/doc/Parser.java create mode 100644 src/jplag/doc/TokenStructure.java create mode 100644 src/jplag/options/CommandLineOptionsExt.java create mode 100644 testdata/doccn/dongxiao-2.html diff --git a/README.md b/README.md index 5ec8d2d..4bfcde5 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # antiplag 程序代码及文档作业相似度检查软件 -软件主要检查、比较学生提交的电子档作业之间的相似度,能对多种编程语言(如java、c/c++、python等)、多种格式(txt、doc、docx、pdf、html)的中英文、简繁体文档(如实验报告)之间的文本相似度进行比较分析,输出相似度高的文档,进而辅助发现学生之间互相抄袭的行为。 +软件主要检查、比较学生提交的电子档作业之间的相似度,能对多种编程语言(如java、c/c++、python等)、多种格式(txt、doc、docx、pdf等)的中英文、简繁体文档(如实验报告)之间的文本相似度进行比较分析,输出相似度高的文档,进而辅助发现学生之间互相抄袭的行为。 ## 需求 [jdk11](https://www.oracle.com/technetwork/java/javase/downloads/jdk11-downloads-5066655.html) @@ -14,28 +14,31 @@ ![程序主界面](./maingui.png) ## 原理 -系统采用的主要技术是自然语言处理(nlp)中的文本相似度计算。程序类文本的相似度比较基于3个开放系统: +系统采用的主要技术是字符串及自然语言处理(nlp)中的文本相似度计算。 + +程序类文本的相似度比较基于3个开放系统: * 一是基于网络服务的[MOSS系统](http://theory.stanford.edu/~aiken/moss/)(斯坦福大学开放的支持多种编程语言代码相似度比较的系统); * 二是本地执行的[sim系统](https://dickgrune.com/Programs/similarity_tester/)(支持java、c等语言的文本相似度比较)。 * 三是本地执行的[jplag系统](https://github.com/jplag/jplag/)(支持java、c/c++、python等语言的文本相似度比较)。 本系统在它们基础上进行了二次开发和封装,针对moss系统,开发出了客户端存取模块,实现了代码文件提交、结果获取和解析、结果排序等功能;针对sim和jplag,则将其集成到系统中,在moss因网络故障等原因不可用时,可作为替代产品使用。 -中英文文档作业相似度的比较则基于[shinglecloud算法](https://www.kom.tu-darmstadt.de/de/research-results/0/1/shinglecloud/)(一种基于文本指纹的、语言无关的相似度快速计算方法),文档主要处理过程如下: +中英文文档作业相似度的比较提供了两种算法: + +第一种是基于[shinglecloud算法](https://www.kom.tu-darmstadt.de/de/research-results/0/1/shinglecloud/)(一种基于文本指纹的、语言无关的相似度快速计算方法),文档主要处理过程如下: 1. 使用tika读取不同格式(txt、doc、docx、pdf、html等)不同编码文件中的文本内容,并将其转换成能统一处理的文本; 2. 使用hanlp对文本进行预处理、分词; 3. 使用shinglecloud算法计算文本之间的相似度; 4. 根据相似度排序,输出比较结果。 +第二种是基于jplag的GST算法,对其功能进行了扩展,增加的“doc”语言类型,可以对各种文档进行相似度计算,并提供基于网页的可视化比对功能。 + ### 参考文献: 1. [Software Plagiarism Detection Techniques:A Comparative Study](http://www.ijcsit.com/docs/Volume%205/vol5issue04/ijcsit2014050441.pdf) 2. [JPlag: Finding plagiarisms among a set of programs](http://page.mi.fu-berlin.de/prechelt/Biblio/jplagTR.pdf) 3. [Winnowing: Local Algorithms for Document Fingerprinting](http://theory.stanford.edu/~aiken/publications/papers/sigmod03.pdf) moss系统采用的核心算法 4. [软件抄袭检测研究综述](https://faculty.ist.psu.edu/wu/papers/spd-survey-16.pdf) -## 更新情况 -1. 2019.12.1 使用hanlp作为分词组件,支持pdf、html文件文本的查重,修复若干bug,发布v2.8.6版。 - ## TODO 1. 将jplag整合进系统。已实现。 2. 支持html,jsp文件代码的查重。 @@ -45,4 +48,8 @@ 源于开源,还于开源,开源是美德,加星也是美德 :smile: 。 +## 更新情况 +1. 2019.12.1 使用hanlp作为分词组件,支持pdf、html文件文本的查重,修复若干bug,发布v2.8.6版。 +2. 2019.12.3 扩展jplag功能,提供“doc”语言类型,实现了对多种格式文档文本的相似度计算及可视化比对功能。更新使用帮助,测试数据,发布v2.8.8版。 + \ No newline at end of file diff --git a/bin/.gitignore b/bin/.gitignore index e960251..67c6176 100644 --- a/bin/.gitignore +++ b/bin/.gitignore @@ -1,2 +1,5 @@ /preprocess/ /gui/ +/jplag/ +/utils/ +/shingle/ diff --git a/bin/gui/plag/edu/PlagGUI$5.class b/bin/gui/plag/edu/PlagGUI$5.class index 035161053ee78fe0c8e1ba1be15bfdaa1bdbfaba..41d31781fb2a6f6a2e4f354f458ca025126fff69 100644 GIT binary patch delta 200 zcmXAi%L+kJ6o$WZ_Ri%HISeRC87P+tPhjYNog;?I+{n~;k4(LRl4Rfklz0KkJD8EI zvIgt&t$+P%71#uv^Y?fHN({En$M*ZES*YoJVm#{QUtptT`!Grmt)-dhqSJFLg+19d zRWs8mbI{2|Zb$Gy-mUH_Z$++%NbV6cPhB5hxO2r7lKz_!B_@;DQm27Mh#O5>@~ka1 ei25+{k}1B0DLTSzy5a|#IAQwIIdW%7mG}V!WhE8> delta 170 zcmZ3)-O0^$>ff$?3=9lL3_%;Y{xD7EXa2&O#lXbC3>3RGnTI8wv1D=%i!PU&kfY8v z2KAW?wv!jLSn+NIN;5ME0!cpx;mO}vykpNX2myHt5RDAN3?e`t&I&WR1mAdYTkk# zbt3b!qvAcxsk1*krePY5np2<6y0fBU4;Ov>m;@ORX2=s?#2E9!ggD>4GUbgK8RleJ zkmH9u76q1+Snp4@st^ zn33j}40FC%@Xe9}E2^xiv7yc%4YoAd(PMw#4tH^sjguUl<>4ZetNh#)@Cm)juAmM(@cpm_q=w6bIP= delta 39 ucmaFE{)T;n1~a3>WKHHMQC9{IMmGjNMt24YMh^yAMo$JMMz6^;m_q=vn+MVW diff --git a/bin/gui/plag/edu/PlagGUI$9.class b/bin/gui/plag/edu/PlagGUI$9.class index f96ccbbaeadba74f748303cc23ca94ee512958a5..dbb01820c40cb5523e90ae7b3a2b3c73ade56189 100644 GIT binary patch delta 31 mcmdnUwvlZ^4-;eH3#!vqECCY(HUSj_ zK>-#6P60gwT>(A=X8}M1asfdDe*sMcjsZ>sm;q1&qXAI^umM>EzX4hU$pKrFU=ykV TUXu$HJq2O_1p#BTO%$jBz*rnQ delta 113 zcmV-%0FM9lEA=a|nFs+hvziEb3Qf=g?EujN`~cDe1_0Cp5dhT#9sv;pECCV&HUSg^ zK>-y5P60dvT>(7x) JlXnXm004cR4Cnv= delta 52 zcmaE({91W~1}CHZWKB*P=7N;8&7Pc59FjQI_8;P7K8iaSWxC I_XrvQ0A?l(WB>pF diff --git a/bin/shingle/plag/edu/ShingleSim$Fileter.class b/bin/shingle/plag/edu/ShingleSim$Fileter.class index 89c83ef2a1f0dc1bab41c5fe33af5a294a9e9ae5..1e38153e2a0fe4e6f794523afd2cdd1ab3976848 100644 GIT binary patch delta 282 zcmXAjJ5Iwu6h+UB9mR|&$OHr;{KQ8>VPqg7m?ZpCM2LcxHBf+P=+n_7SFnaOG%j4Q z0M@}W5aME*H|Jfwd*^=D^Td7rlpjEc(YrHtJz?KaOYfc@?h_6T@gF*t=+1IPFJG|F z@r9xFGS|vW`#34FMDkzn6)YC4-UPo{u>D3B{B8wl@a+nAf)5p}1fN#W4E~@3CwJ$S zT0D)W3X5oHwsD$W5`EmD$1=xkkr6o2*5{l7w_}D}af;XJ#(xu<%xhC888VClMzZwP abRcyG`^~t{nvpP+U}Znk>vU?AdI`UHz$&!> delta 208 zcmXAjy$%6E6h_a?5^JI{BuEeu5@F(Rv3}ox)+6auTM>l^&}g<$cmU7gF+|*~X7bI+ zx!=8!|IV1#_wfYAZ0<&BV<l|3wv&Xa zv)v>VoOMYs;`UFZQ#bD>qaJyAoA}YitLy_7Oj%-uHU8BUD9TTk$Sna1p8*rGG&7-D W^oJlrT?q&UXy#G^MpPwLYxn_pSRZ@< diff --git a/bin/shingle/plag/edu/ShingleSim.class b/bin/shingle/plag/edu/ShingleSim.class index 8eef08c758c61cd60f4b780c62f27bd342f025b8..504264b45f83432039a2e1d4c66e567827a4de87 100644 GIT binary patch delta 338 zcmWO1-z&pW9LDkY*9ER-4bxIeBHvOjh+88PMSi5l{MuNiVRl0#$O}t+v&;`G z{1W7kRihZMQH9HBWP@5DrdD!EtJ`pdW!sWzn|{UmwEA@p5{Z}D^3JIA`(nnhav=R7 z4vwfNP7}wpazYzt`d@-El0u#fm=f?pkhIvBIeU4v;k;4AtvWM`$g1p)I_@!fpoK@; dd7_hN26)wzH+Fevj}Mi6a>|!N^YKtc2Lc0gh%3(^5(zE9HPVWJpRrlld6g2=m=eIFL&qM4}SfVB-kdSyQ zxWtMIFRwshuM9M2zH{cx{APz)Gy3ZdfCGNIX5#N{hBmrtNA+q{we?o5*0dfU4qg!0 zAk-nA`!^4_X6o(uB=MEJ>{F<04WGC>ThUl_9yhj6I$gOgMl-Ye?1}rNd1jXv_IV}X zjZ@w!NiUeNq2D5F%T{JIq=qs%LLB9aSW!ahBd+DT>d0{~N~*=njMB>bT9!cV_1QPN(=))ZL%f1u(^DL-$l(d8QUdyJprn+*>HuHLkZE z90c*V+Q~UQS!6u3A4Yo`i_y z!({jguTmT;Ra=Ztbmfh$T}Gsz9h+)|b9)1^i)xvr4;|BzRq u5RWDKRg6$d3`=av))vnK$3nTZs2tunrH?s@&0zmhiHEY&{qOQj@$ml;n0p4pXT$#&YO{;+pv&&+)D&3DfC z&6v;r{qa2jYw?)|mq1y{GGoKd(MYO!hZ&0oyEQ08PZ%eS=9m#5XzuPm5e-`c?oDRg zw4N3ysj2M|P_`u^Q5_{H^+JVPV19=gk9MU7`=iOdMt>|SP|=YH8?hcEY0B>`s949% zVc9o*-Btn5uyHafy%1Pe(_y<{CTzD_oi913xx*Z`T7wQ?dl<5kW_+MkdQ}=S2jYg6 zO43BbtIBVt%a^H9tfuy-Fa2ipm>K0Ui_9hLBDSRmjnbj*u|Sj6B&N9Yyz#o+X5 z@N%C||Bll$t4MpP-fyUXU z%&?O6wsj7gu|~scfqC|Z^1OMeyQdhXfhZL<%~3~ats2&|YgTS_tiuK`HsZ?yI!%x= zGiD_1sPB+&w@SA+2~^eWm>Ghca~?<$uap~IhiAOliZ)7*=A`o02rQ_XIk#1gZTDh( z;n=kvJj;m9)#cJbFP^hIt%SpiWV^$QP8o3ZloWuBO1Brgu$$>KY>rW*luJ-@8|~wi2qBlNlub-#17%a@ z^yK+xk}KBe5O*$uv=B4Q(UcLF#u>Mpd9EyTrjSH}pxDd)YyKupcck|phqphG^6Q1QbSS#?L_ zVV)m!HBWExK1r2_yNWoc!T*k9?SYw*)NnCJj7{s-WYQRwr-zOUDAn-&S)#m$YethA zUKjA^q8N z((xaAaShiQNF%}oD=Ou=g3+hr5`OB%4Z9%8Fv_!8PVSbEHL{yjxXrkezb=z0IFpI& zvP@Z-&iR}wjwOjINk)0{RnEeIKw(2nnvlDi0=D$_O@Y)FE)!i`VeoQcU6#4Zkfu zNJodQ=%5$(aG$%Oo%~)&?=GiKGKzyd1%8J=X!w0jR>j?3{81Kr&91C5ZbLDW1B{H| zp{!8$Z0eyzDjCk+app*I2fLf3&hP|XEaANjWS?%s&8Nn%X1;|)>n7m2#fA$ypRya= zJ9dOHzifo6T?eM$r&j#Nj4Z2t@o%nea{Lu5sEy^KgxAG6SS40dl2Q{5V8Hl4iW1o zHh{I_1hE7$i`Xc!mwEM(G8>XlaRw1Y`6XM&Fd(JJ)^V5=n{1uHi`22s*)a@q7gd}W zXRnHP_-(i7jfwUAziMai4TOsi>Z-k%9dfw)NJ3f|!AcvW6cA-LGuLMOH&hz@@D;4lon8Dye+C<*XJ>_ULj z*u=S;v6AEK(1s>#qm6cKzz!nM&_Ww8u-j;-9bLpi*hy<`Tx%O`wj)gJIB&6%>^-tx zkhUz@Y@H{uB%7`8u;t>v0aE-b{a9%eHbk;FWT|u0zC)**R&y;$UI?BHx2M?ACoB9J zk)-tP&hzGlyQsH_w&IC+PSQ!vSlJ!UTrZKaQZ95`bV}APFr&X0P((nT#3izKnZIwy z!uNyv3A{OhA9apH_bcS)oqY{y{G_+*)6734lka|l>VR6(;vR?3ulktgKC=1V15}63 zxrO}ZnCqFpv6SYy?rk9Zv@$LckKdDJUn5-Zzp zfZN3FqQI6^2k-9q2tUKm`Bret)=mk1lb-*QNEzOxl@GYvoh)UtD7fj3)X5NAXS0g& zaCJvJ9*)knaz(+h_G?;`x;h7Mh8xZ+=aMb03mR_nto)p{qCCS>aklR1;`8LhS74$@ zg0mUP3-J~s;~d<#m>IHov`nJ1x8kD--0rBaNZ-Mo&W7HA`(6vRMdeib-9BY)*N6C+ z5`1t@5kAGIUeEkpp7c(JFJVMLMd*xEIV1EWV^q)gdX`7YKpi8%Rq!Xq6CBlmhvb6) q!@#`!l$eV%v@8QMQhvBH6b)vYKU{DoHwa_v}AlfqUyb zlID{|L0WNA^2lhDtja8Fo1Au8)joM0u%;tcR53d?J29)7g%ngbJ2z{YbP1Qeaj zAj&wEL6LD9gBIg-23^J(3wF<3FqW{3vzQyJ$nWHQcU$Yq?*P{_D|p_Fma I##&iZ@#*E1uSufk>FfcLZ0!8x} zL>UVh6d4N{v>1yRbQy~o3>ZroOc+ZUtQgA}qJjKW#tMc^#!7}<#wvzF#%hLA#@fjT G*(v~P3l;YO diff --git a/bin/utils/edu/WinCMD.class b/bin/utils/edu/WinCMD.class index a0c58dc0cbf1ce69a5f1ed6e2c5a85236152d8bb..f642d82f384185188d917dadb45657527afe298f 100644 GIT binary patch delta 5157 zcmai234D~*wf~>YWR`Eftdj{D%`yQ}$WB-S41$1?nixQK!XggI5C<}oI5W{GwNqQP zwHB;?Rx7qBpiQ63s!k#iv{tdtqEE3dMOhSgtJc@{wa+c@f4@mA<^A68{Rp}Do_p`P zXZxRXz8t*xuS>LNzdmpXza1-Dbp)q5@CQM9 zO>I2frJ)M1=y(;c336JyqtRd_zBm*P&JA^Rgkl2A*zzU~bvUM@8LtZp$2OHWuMb=q zsO<l+F)CE?W{;#J*wl>V!SD^hXWge z7lxudGd9I&SgP_y2TluY6&1A=H5H22J38LQdje;yH4sruC{X=bw~Ws$ZsYFU{82OU zfsPOH5uXS}+JZgv*9oj+8_OFt%*4k!F2E;7Z$^b1eT!;|whZQmVVS3N}&NSa;5D9P~TSi~xD!~$0am7;=4#ua+0 zF=(?XAy&{4ZOUf$cre^$&TQ*K9q}MZ4o-rxc$BuWMgn18wJHj4kVxBgb;jam6wsoQ zsY{mfg0n5y5sWK87{?regt*lMovj)~JUU*)Q5r)==u&_Mf-I^tYi+EvqdOkFBoM#c zA?FA(YiidkBSeF-?v8k^Lkb0H)g2mqQmjh}wvuC@t<52&w1*0qx~UEsrOw10Ql`eJ zL&m5vtYu)Pl+!~}A@GbfvK^ixsiJYp98#lB^pGnqRjliD87~uzXPx!wGD$GXNY{KB zG`}VjWiovyQ;b@zs!;rb3{$+?OQN0a(Lh*}sdAn=Fx|LHD^AGyf;{tpFBA#ILxGOa zH9=qJ2A?`NOPAThdar5i42J`ew&qYIsP-<@We!}_E!rJX*7~+CI;c@2of;=@{0_O8 zB8TY>X`m|zF~*@V7Nkj#-#G7*C5zWI7^7W@w6bO;y@k>nPrFJC(W_*YQKNecSIVU- z3}cG_Wjgj?uO=&HtuC$7X0+-P%E&P{+<9gc&gDvzATs$&Lb|M%4aR^zd3>0TS{LmM z`(p7xWL;;p&94r0>e8hYRB^f_+M!Xns4fc;u1BfjzD#*Oni9QG@5eHWVLh**q zs3te#hj#g?@oMI4a;LF3E3fh{YSZeFUkdbCD7>cCR~_?JcTvE*sgAEY?qfQW?M!E~ z!}v6-RPHr$vONXA5@mhz5 zU4dvU*cf3#o5}#dRQbIwN8~xBQ82zJ6b^L+m@M2b=oZN|n)0UQbbH=om@jWVXgR$ScA_0{;!N>#TTG%Kcuwww-Ed!`)$$hC0 zbNcae5`Ro$NgwW-Th)&@W>^nmW{V~FtwnyTI$6mLueBd<`_sj5^QO1>wL|qzOTD8P z3%#1hG2U-i%)EAs*PdOI#QRAc?8l$TI``8=5=~_7w<$(dURxhN^Sex@UV9(@+K<2c zb+5~-=YG|X!A%bFI6ThngS)EsQcMed!`4BGjvwP3+=LSR1QpoKpxQz!-i!tKDWbTA zD&I<~f!j&DhriF^jdk=dkl2B9QOF6kmUQYb-29Q)csz~let>pd&W9?{BX(NVO4cE9 zaF)&o)`^ohYFUjJii>!(&c2+7o!s;i%XcDym z>+n4KfT=m<4b115#=BbOVK^%LC1cq>u2M>8as;+gp1a`0FHME=Q>VYtM>Dk$^L+~_ zs^9@Ck|x=_nkG5QZkEJK%9zV{m!z>i`E7LeWP7#W>6YBeB<{EqITh7O$)j6sNy%4q zuU{KI#U*}y0KS$!jPz%CGu(1+Qi_)4mZ>{iK}w}s_d-{DJ~da9k;9YGsfkxhCWf}Y zm7=IP8R5FAU&b;z+(LFLI?O%&F!B@=ilexB^`X(38k17XsPt!YCo?JIoXT{y?ER=z@3>_~9};uC*-4o>^bVG*)diON^j;MEonB`Wzpqf5 zH}oRUpOcz;a~@AkzTtCG3MKjWAc5i*ifl@a<(~8h=@gf!7Ul@Ye0CY2(^TC4inwR| z-9t}qrw?{8X6{8M$pz?T?8`Cnd z>{jA#Kvpu{cDGbK3=eZ@pRD%V6ztZ_t9}MH29esV=8oU)wR!EtR-iuJlfD&8Dh4Fj z(kJbYU<5C#|DMraYf>&Zs!Ph2buca!IVq7;NcPJWDkO(Pu-?q}r|7abS(M&leK^gc z^foKXJM_c5EJp9Ks=SYjXyfJlP-(|UOrW0_PZW)kX5;OmGFM!>sYnglB;Cj>9z9~^ zmsn3DQswe9RW{}qk4Tsq-z`_BM46X@U=KwcLb<8|tP)H?sx-Len$*D|Q&4sLurba& z^&Ot_+J}#x$szY5Qr+_1p}mW2{qjA(!|PCZzQF{y+^B$_r-(^x>V;;`WXu0=+Qk25 z+BDoq0B*{(o}-;* zFk5aFgQ0c@>~fp@jM&&|qU09RcBz@%ZmjgxdTzFT!Hgofj~QsVMBHKA?K}5uzonA= zop8D3=YCr+3f!_aRVqmNg`%j4E!>7Nh1tO!C_U39ZrMg}6ei^!v*xFc{_BCA!>+78 zjI!z^4j5BQT`3t%hmw?P7?d{|6De;h4-R{i#3660nQT$uam#%sd@6_@g^%84LwL(E zdl#!)9vH$#8t!M_e3A}SOYjD5RVUJ8ezf8pAG6_$~WIi#4iP+D4I~^xw20oBk z_)0D?v+gxae)I{9*(1-Jkn-)GA&=Ub`BNTsFvqLBcMJXfe9EJookXc<#=entiJO_H z)px-+Zr!E0rX65ps%Gyrofp3o#|LCLJ2~0Qo=*0)c&&Z%klD}mVy9b@gr)Q=dmqMa zUi+bK*ur*hyuypc>rjPS4q4mS*CpjQir6nt)jJ0;mANVL?|zM4p!0^LJnOa5hw{AN zWvSPDQRj7e^zl`^x(y@Q3t-+e$o6U`H7Q4X*+&d7Uf!hhE8punw0-coYOnUFyp{&G z7ej#kqc=S%Z?yPr&kQbCc(y7RpH$nNvBTU{y_dk7Op6WhlK^;*Meme39;2UA0FA0H zP#S5|CgjM)bZ;|CSh7aSJXEn2%Um zja4n+X|tYMSc4yvx8yWMF2p{0Ti&6&OR!tsHDO~`DMtPq!2fs8DYvco# zU?(@0vd8_9HI|<$@)6fAez35g<*v@2LFIx!@ybzi_AwuML(LdhR|ZaAPY$dOMqcVK|y2*vIIdK$q)y#IWxfs>U2S^ ziikcGK}D8o6ckX~2^v6At6iwAS`~{eh#S_uTC0NKcV`CN>aTxf?!D*Ud+s^s`_4Hp zpUv8Hj{e6JkL?98P#$?mf5BnB?p(M4e-orvRxN35^fd(ip|-|wl>;vek}4W?cyPeL zt5`2c_SM%ra8O`h;#=yg9`3+D)R~Y2htwE!;B_@N=@^SQ44i>C1@00n(~+D1wm^y< zII8yBmCg&)&wB>m#|PF4qsE4h1RiUDx+h)7hxo*Rj(-bEN~}+kU6#jXx3=o(1sXmR zq{YmvnjUOv2>STer}$i*{;#z?t!EOxw4O`LyM)4H299HfAfv$_Ubv{Gsma$|KPAxY zR~JqgI0+{m3br*nLzrPjvu9IH=8H_5d|WSIGG$y26> zbcl2}umd}FaZA1-1=7=M%@|bd5o9;DghEvcnJRyMTh+N->Cy}7{E-%`JlSe2QxG8v)zK#*|C5W*=nR&Tw`kYR$9`DK-54)I#6GYe*nq&knMqRCS| zz*Eue88~3T0FOgP6KW5~u{6vx+M~Q^X$x0|+rzqymoqgvLy)C-RMqHfZm6nlYYqpR z{Dw?mNKa(;bBvd>4Xlw#)+#;MIwLFBevX{0$rNj0R<3(0E+l4``2yjIEkRwT<62E- zSl4GAh|DA4p+M6@rPxyu@>I0ag!#m2kwY#J7;10P=?!X0@l=F8I;!O&15;#?Airc% z>6CBnni&oTnj6Nq1sd!9!4VGe$DM8QP*cJmzR)1hm4IQ5c}P41hAfdwtWUCQT}_N@ z9Gd{ADT`Hjr@1cgeJRn6y%cDvQZcXf`}l}1t&*n+$(3tm<>uDv5|X8)iCjijmc&it zh4|qSrL!G!Ig?_3{qpMmgRYz(DnGw?fiBDCN=>d{FuujP*5BZ7H{>dbSTk}aXa3xf zU&t>R#z1quzkS+bYg0~s_BXH}4f= z0X1BJN?eEFLH#4C-OHr70m--+1_Cr~35swDs?kW( zns~e!v(Unems)ds94c6fBwR+b+iCVPKDr!3a0R@$3S+RsIxl}-M6eBIJoP+Fe-d7x zRwV79`{{t#lWJ@gUVHY7Uae>tvpTg4p1MvHdUaFJekqDq=42O_N3nK2(oCI`SLSrU z$*sSuDaY3m<3Ho0sU_Bj<6{@9>muD8H|)ZZPE5#tD~icc%!=)$J_8pOypy4R?D0}yP*8+coi85xt2x(dDcdmVAO9(LS7s;nfm zH!{krxWAgv-pr`pLR>5|aSb88jTBjnDY%^^ScmyoPh9T66%pJ?#@xm0ck|drY``Wy zbq}Jr7yED@4&r_sjy=8&Z{qsL_irMdxJ)efeCRl%>+658xBkkv8CBNh7$=cZSN0pYJ8?&+ zbo1KFQYxZiT4M{%-aTq;ZrggyD(jM7b)C}tx9G-`>d#$drbOAwmlYPz=}USjaa2m; z1lc9!iXcc3qlRGsx)|}@%=0}=xxFOA<4n>gnB)7H-%m18pTgOAnib%8MB*9Tgg;oH z7xod&>fWc=SuF!;bS3I#kTtnaQN(>5OEk36z0x1Lm*nM1o!A=6J<_~lmkg$XE*Tm( zZmJTxWH@C^PnBI>yAql9QwK+!5|^n!cr{ad7Ir#x`qcQCzEM1;()Y(Ivd^Z}C1XzQ z?UHd`hv_iwQ8~+-YNkeIvQ<^&?ltZI2RHIl!A)YOsl>QCHg02Ik7waLatwHm!Tu}q zSi8#cA`9?Ktb#9NDqbPW4v-xOdFv4K@-Xx7b?P~SEqIev?=7awJ9wC1nw{k4llah} zijR2YW2VBVmaAAHGPJmONG+pmr_!hRluD6R$ds9K9*MOUn#_{fOpz_jr`1e)2bFRr zbF7<-s}@YRA7i~2=JUO$O03LfdLU^GT}UNzAadD}I4VCvFJ4{O!ReAZue}3#E;(P- z+n8j5RnXTgP2jzd;m?nXFNSc(_YsT%SZ^)t+oNynbPTbI@8U(&<9HD`h1bTu?(%6W zRa~-|nc~&TOs!nyz{PVC36w>#nUdm?M)Kb}+&3q33X^7*Bj5{S@+Cv{6-(1GHo4=Z z?$;QK6RdNjps9H<=$cr@%vz~f-D#o^HiVGJJHb_l(t*?Om6 zb!0lLrury5jZsSsqeqF6%H|`Cd3YUO>yp;G@=ghQQ@SKnXQp&Yo7W}^n9VL}S2?vz zCD1dlOl|K5+-Pda163|0n+`TN`B_SR6nmnwLdkZ?wKZ&~!&zxoMpQp_r(P448%;aW zk(<5FhO`)2(ii#C551&>#lIB&r3^!*KSoIfzpyGXSE^X-2OuET2(q5F%OHX~ z7%OE6*2z$8p=`U1K&OnvZi$S>J}TNTlkmKpgI8q=|CgMK!%`bVcQZQ?K_IVUZ4O7I znVpstqD(bBwmHsJb4x}Ry5t_YmnP?9r*z1DM7tMT=KVZpM}OQZTiLxdmf&TwjS>fo z@WO~Zz`8u`rNw$#8&_<2@f~YTS!qV%NTQwHW$iD^%iJ tokens = HanLP.segment(strs[line]); + int col = 1; + for(int j=0;j table = new Hashtable(); + protected String[] reverseMapping = null; + protected int serial = 1; // 0 is FILE_END token + + protected void createReverseMapping() { + if(this.reverseMapping == null) { + this.reverseMapping = new String[this.table.size() + 1]; + for (Entry entry : table.entrySet()) { + int type = (entry.getValue()).intValue(); + String text = entry.getKey(); + this.reverseMapping[type] = text; + } + } + } + + public Set> entrySet() { + return this.table.entrySet(); + } + + public String tableStatus() { + return "Size of table: " + this.table.size(); + } +} diff --git a/src/jplag/options/CommandLineOptionsExt.java b/src/jplag/options/CommandLineOptionsExt.java new file mode 100644 index 0000000..0af03c1 --- /dev/null +++ b/src/jplag/options/CommandLineOptionsExt.java @@ -0,0 +1,46 @@ +package jplag.options; + +import jplag.ExitException; + +public class CommandLineOptionsExt extends CommandLineOptions { + + public CommandLineOptionsExt(String[] args) throws ExitException { + super(args); + initLangs(); + } + + public CommandLineOptionsExt(String[] args, String cmdInString) throws ExitException { + super(args,cmdInString); + initLangs(); + } + //ʼԼϣڴֵ֧ + void initLangs() { + String[] langs= {"doc","jplag.doc.Language"}; + addLanguages(langs); + } + public String[] getLanguages() { + return this.languages; + } + + public void addLanguages(String[] langs) { + String[] strs = new String[languages.length+langs.length]; + System.arraycopy(languages, 0, strs, 0, languages.length); + System.arraycopy(langs, 0, strs, languages.length, langs.length); + this.languages = strs ; + } + public static void main(String[] args) { + // TODO Auto-generated method stub + String[] langs= {"doc","jplag.doc.Language"}; + try { + CommandLineOptionsExt cmdop = new CommandLineOptionsExt(langs); + cmdop.addLanguages(langs); + for(String str:cmdop.getLanguages()) { + System.out.print(str+","); + } + } catch (ExitException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + +} diff --git a/src/preprocess/plag/edu/TextExtractor.java b/src/preprocess/plag/edu/TextExtractor.java index bbd33ea..3bc3ebd 100644 --- a/src/preprocess/plag/edu/TextExtractor.java +++ b/src/preprocess/plag/edu/TextExtractor.java @@ -105,9 +105,10 @@ public static String fileToTxt(File f,Metadata metadata) { public static void main(String[] args) { // TODO Auto-generated method stub // File f = new File("./testdata/doccn/dongxiao-2.doc"); - File f = new File("./testdata/doccn/dongxiao-2.pdf"); + // File f = new File("./testdata/doccn/dongxiao-2.pdf"); // File f = new File("./testdata/doccn/dongxiaogbk.txt"); // File f = new File("./testdata/doccn/dongxiaoutf8-2.txt"); + File f = new File("./testdata/doccn/dongxiao-2.html"); System.out.println(TextExtractor.getTxt(f)); Metadata metadata = new Metadata(); System.out.println(TextExtractor.fileToTxt(f,metadata)); diff --git a/src/preprocess/plag/edu/Tokenizer.java b/src/preprocess/plag/edu/Tokenizer.java index 7d9037b..0c44e60 100644 --- a/src/preprocess/plag/edu/Tokenizer.java +++ b/src/preprocess/plag/edu/Tokenizer.java @@ -5,7 +5,9 @@ import com.hankcs.hanlp.HanLP; import com.hankcs.hanlp.dictionary.CustomDictionary; import com.hankcs.hanlp.seg.common.Term; +import com.hankcs.hanlp.tokenizer.IndexTokenizer; import com.hankcs.hanlp.tokenizer.NotionalTokenizer; +import com.hankcs.hanlp.tokenizer.StandardTokenizer; public class Tokenizer { //ַתָָķִʹַ @@ -23,7 +25,8 @@ public static void main(String[] args) { // TODO Auto-generated method stub HanLP.Config.Normalization = true; //->壬ȫ->ǣд->Сд CustomDictionary.insert("4G", "nz 1000"); - String text = "i am from china.Сеķιèеľȴ޳ɡιЩС,i will go back HomeҐ "; + String text = "i am from china." + + "Сеķιèеľȴ޳ɡιЩС,i will go back HomeҐ "; System.out.println(text); //ȷִ List tokens = HanLP.segment(text); @@ -33,6 +36,24 @@ public static void main(String[] args) { } System.out.println(); + System.out.println("ȷִ"); + //׼ִ + tokens = StandardTokenizer.segment("Ʒͷ"); + System.out.println(tokens); + for (Term token : tokens) { + System.out.print("("+token.word+","+token.offset+","+token.length()+")"); + + } + System.out.println(); + System.out.println("ִʣ"); + //ִ + List termList = IndexTokenizer.segment("ʳƷ"); + for (Term term : termList) + { + System.out.println(term + " [" + term.offset + ":" + (term.offset + term.word.length()) + "]"); + } + System.out.println(); + System.out.println("ȥͣôʡŷִʣ"); // Զȥͣô,ᶪʧԭļеλϢ tokens = NotionalTokenizer.segment(text); System.out.println(tokens); // ͣôʵλdata/dictionary/stopwords.txt޸ diff --git a/src/shingle/plag/edu/ShingleSim.java b/src/shingle/plag/edu/ShingleSim.java index 5b1c776..9a283f7 100644 --- a/src/shingle/plag/edu/ShingleSim.java +++ b/src/shingle/plag/edu/ShingleSim.java @@ -43,7 +43,7 @@ public void explore(File file) { } } - // ʵļ˽ӿڣڲ෽ʽ,ֻdoctxtdocxpdfļĿ¼ + // ʵļ˽ӿڣڲ෽ʽ,ֻdoctxtdocxpdfhtmlļĿ¼ class Fileter implements FileFilter { @Override public boolean accept(File arg0) { @@ -53,6 +53,8 @@ public boolean accept(File arg0) { || fn.endsWith(".txt") || fn.endsWith(".docx") || fn.endsWith(".pdf") + || fn.endsWith(".html") + || fn.endsWith(".htm") || arg0.isDirectory()) return true; return false; diff --git a/src/utils/edu/AntFile.java b/src/utils/edu/AntFile.java index 4bf3d5d..b4c6efd 100644 --- a/src/utils/edu/AntFile.java +++ b/src/utils/edu/AntFile.java @@ -184,13 +184,13 @@ public static void copy(File srcdir,File desdir,String match){ public static void main(String[] args){ File src =new File("./demo/7/Selenium.zip"); //֧rarļĽѹ - File dest=new File("./demo/7/"); - AntFile.unzip(src, dest); + File dest=new File("./testdata/doccn/"); + // AntFile.unzip(src, dest); //AntFile.deleteFile(src); //pass test //AntFile.deleteDir(new File(dest.getAbsoluteFile()+"\\zhengchaota_atm")); //ȡָĿ¼µjavaļĿ¼µ - String[] filter={"**/*.java"}; //"*.zip" + String[] filter={"**/*.doc"}; //"*.zip" String[] files = AntFile.scanFiles(dest, filter); if(files!=null){ for(String str:files){ @@ -199,7 +199,7 @@ public static void main(String[] args){ } //ڵǰ·´һĿ¼ - AntFile.makeDir(new File("./temp")); + // AntFile.makeDir(new File("./temp")); } diff --git a/src/utils/edu/FileIO.java b/src/utils/edu/FileIO.java index beef207..ae24fd3 100644 --- a/src/utils/edu/FileIO.java +++ b/src/utils/edu/FileIO.java @@ -1,8 +1,16 @@ package utils.edu; +import java.io.BufferedReader; +import java.io.BufferedWriter; import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.FileWriter; import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.util.ArrayList; import java.util.Date; import java.util.List; @@ -42,13 +50,67 @@ public static void saveFile(File outfile,List listsd,int type,String la } } } + //strָ뷽ʽдļ + public static void saveFile(File outfile,String str,String encode){ + BufferedWriter fr = null; + try { + fr = new BufferedWriter (new OutputStreamWriter (new FileOutputStream (outfile,true),encode));; + + fr.write(str); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + }finally{ + try { + if(fr!=null) + fr.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + } + + public static String[] readFile(File infile,String encode){ + BufferedReader in = null; + String str = null ; + ArrayList list = new ArrayList(); + String[] res = null; + try { + in = new BufferedReader(new InputStreamReader(new FileInputStream(infile), encode)); + while ((str = in.readLine()) != null) { + list.add(str); + } + res = new String[list.size()]; + for(int i=0;i lists){ int res = -1; + File tmpf = null; + long t = System.currentTimeMillis(); + try { String INPUT_FILE_FOLDER_NAME=files ; //ļĿ¼ + + if("doc".equals(lang)) { + tmpf = preJplag(files); + INPUT_FILE_FOLDER_NAME=tmpf.getAbsolutePath() ; //ļĿ¼ + } + String jplagResultsFolderName="./jplagresult/"; //ĿĿ¼ + // AntFile.deleteDir(new File(jplagResultsFolderName )); //ɾĿ¼ + float MINIMUM_FILE_SIMILARITY = threshold ; String EXCLUDE_FILES = null ; ArrayList args = new ArrayList(); @@ -156,24 +190,27 @@ public int execJplag(String lang,float threshold,String files,List list args.add("-x"); args.add(EXCLUDE_FILES); } + // args.add("-clustertype"); //Խֳ࣬鳭Ϯ + // args.add("avr"); + args.add(INPUT_FILE_FOLDER_NAME); String[] toPass = new String[args.size()]; toPass = args.toArray(toPass); - // System.out.println(toPass.toString()); - // JPlag.main(toPass); - try { - CommandLineOptions options = new CommandLineOptions(toPass, null); - Program program = new Program(options); - - System.out.println("jplag initialize ok "+program.get_commandLine()); - program.run(); - res = 0; //ִгɹ - } - catch(ExitException ex) { - System.out.println("Error: "+ex.getReport()); + + CommandLineOptionsExt options = new CommandLineOptionsExt(toPass, null); - } - + Program program = new Program(options); + + System.out.println("jplag initialize ok "+program.get_commandLine()); + program.run(); + res = 0; //ִгɹ + + } catch(Exception e) { + e.printStackTrace(); + }finally { + postJplag(tmpf); + } + System.out.println("time:"+(System.currentTimeMillis()-t)+"ms"); return res ; } diff --git a/testdata/doccn/dongxiao-2.html b/testdata/doccn/dongxiao-2.html new file mode 100644 index 0000000..197775b --- /dev/null +++ b/testdata/doccn/dongxiao-2.html @@ -0,0 +1,40 @@ + + + + +Insert title here + + +
+2. 单元测试: unit testing 出处:http://www.igsgroup.com.cn/common/ISTQB%E8%BD%AF%E4%BB%B6%E6%B5%8B%E8%AF%95%E4%B8%93%E4%B8%9A%E6%9C%AF%E8%AF%AD%E5%AF%B9%E7%85%A7%E8%A1%A8v2.1.pdf + 定义:依据详细设计规格说明书,对模块内所有重要控制路径设计测试用例,来发现模块内部错误 P94 + 3. 集成测试: integration testing 出处同上 定义:在单元测试的基础上,将所有程序模块进行有序、递增的测试,检验程序单元或部件的接口关系,使之符合要求 P25 + 4. 系统测试:system testing 出处同上 定义:对集成的软件和硬件系统进行的测试 P26 + 5. 验收测试: acceptance testing 出处同上 定义:按照项目要求和合同,供需双方签订的验收文档进行的测试和评审 P26 + 6. 功能测试:functional testing 出处同上 定义:功能测试就是对产品的各功能进行验证,根据功能测试用例,逐项测试,检查产品是否达到用户要求的功能。 出处: http://baike.baidu.com/view/651435.htm + 7. 黑盒测试:black-box testing 出处同上 定义:未知程序内部结构进行的测试 P26 + 8. 白盒测试:white-box testing 出处同上 定义:已知程序内部结构进行的测试 P26 + 9. 性能测试:performance testing 出处同上 定义:用来测试软件在集成系统中的运行性能。P135 + 10. α测试:αtesting 定义:对即将面市的软件产品进行测试 P158 + 11.CMM:Capability Maturity Model for Software 能力成熟度模型 http://baike.baidu.com/view/8110.htm 定义:对于软件组织在定义、实施、度量、控制和改善其软件过程的实践中各个发展阶段的描述 http://baike.baidu.com/view/8110.htm + 12. ISO9000:质量管理体系标准 定义:由TC176(质量管理体系技术委员会)制定的所有国际标准。 http://baike.baidu.com/view/9486.htm +简答题:(2x12) +1 黑盒测试和白盒测试的区别?哪些错误使用黑盒测试更容易发现?哪些错误使用白盒测试更容易发现?各举2例。 +黑盒测试是不知道软件程序内部结构,白盒测试是知道软件程序内部结构。 +黑盒测试便于发现1、是否有不正确或遗漏的功能?2、在接口上,输入是否能正确的接受?能否输出正确的结果? +白盒测试易于发现:1、对所有的逻辑判定,取“真”与取“假”的两种情况都能至少测一遍。2、在循环的边界和运行的界限内执行循环体 +http://zhidao.baidu.com/question/13988876.html +2 集成测试和系统测试的区别和联系? +P132 集成测试对象是模块间的接口,系统测试对象是整个系统。集成测试和系统测试都用到黑盒测试 +问答题:(52) +1 (10)描述软件开发的瀑布模型,并结合自己参与的具体项目,回答以下问题: + 瀑布模型:可行性研究和计划—需求分析—设计—编码—测试—运行维护 http://baike.baidu.com/view/551037.htm +(1) 实际项目开发经历了哪些阶段?(先简单阐述所做的项目) + 做一个航空售票系统。一开始老师说要求(可行性研究和分析),同学们听见后,开始分析老师想要什么东西(需求分析),然后脑子里大概有个相应的想法(设计),开始打代码(编码),最后检查有没有报错,看能不能运行(测试) +(2) 作为程序员,依次写出你认为最重要的3个阶段,并说明原因? + 需求分析,设计,编码。需求分析,只有知道自己想要什么,才知道自己要做成什么东西;设计,有个大体的模子,才能知道该怎么弄;编码,既然是程序员,不编码能叫程序员吗。 + +
+百度 + + \ No newline at end of file