-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpreproc_ja.js
60 lines (52 loc) · 4.53 KB
/
preproc_ja.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
/**
 * Splits Japanese text into sentences, tokenizes each sentence, counts how
 * often every non-stop-word occurs, and returns the sentences selected by the
 * external `binary_search` helper.
 *
 * Relies on three names defined outside this function: `TinySegmenter`
 * (tokenizer), `set_importance` and `binary_search`.
 *
 * @param {string} text - Text to summarize.
 * @param {number} summary_number - Forwarded unchanged to `binary_search`
 *     (presumably the number of sentences wanted - confirm against that helper).
 * @param {number} word_list_minimum - Forwarded unchanged to `set_importance`.
 * @param {string|Array} [separator] - Extra sentence delimiters, visited by
 *     index (so a string is read character by character). The entry "n" means
 *     "line break". String entries are replaced everywhere they occur; any
 *     other entry (e.g. a RegExp) is handed to `String.prototype.replace` as is.
 *     When omitted, only "。" and "." delimit sentences.
 * @returns {string[]} The selected sentences with surrounding whitespace
 *     trimmed, in the order of the ids returned by `binary_search`.
 */
function preproc_ja(text, summary_number, word_list_minimum, separator){
  var stop_word_list = [
    "あそこ","あたり","あちら","あっち","あと","あな","あなた","あれ","いくつ","いつ","いま","いや","いろいろ","うち","おおまか","おまえ","おれ","がい","かく","かたち","かやの","から","がら","きた","くせ","ここ","こっち","こと","ごと","こちら","ごっちゃ","これ","これら","ごろ","さまざま","さらい","さん","しかた","しよう","すか","ずつ","すね","すべて","ぜんぶ","そう","そこ","そちら","そっち","そで","それ","それぞれ","それなり","たくさん","たち","たび","ため","だめ","ちゃ","ちゃん","てん","とおり","とき","どこ","どこか","ところ","どちら","どっか","どっち","どれ","なか","なかば","なに","など","なん","はじめ","はず","はるか","ひと","ひとつ","ふく","ぶり","べつ","へん","ぺん","ほう","ほか","まさ","まし","まとも","まま","みたい","みつ","みなさん","みんな","もと","もの","もん","やつ","よう","よそ","わけ","わたし","ハイ",
    "上","中","下","字","年","月","日","時","分","秒","週","火","水","木","金","土","国","都","道","府","県","市","区","町","村","各","第","方","何","的","度","文","者","性","体","人","他","今","部","課","係","外","類","達","気","室","口","誰","用","界","会","首","男","女","別","話","私","屋","店","家","場","等","見","際","観","段","略","例","系","論","形","間","地","員","線","点","書","品","力","法","感","作","元","手","数","彼","彼女","子","内","楽","喜","怒","哀","輪","頃","化","境","俺","奴","高","校","婦","伸","紀","誌","レ","行","列","事","士","台","集","様","所","歴","器","名","情","連","毎","式","簿","回","匹","個","席","束","歳","目","通","面","円","玉","枚","前","後","左","右","次","先","春","夏","秋","冬",
    "一","二","三","四","五","六","七","八","九","十","百","千","万","億","兆",
    "下記","上記","時間","今回","前回","場合","一つ","年生","自分","ヶ所","ヵ所","カ所","箇所","ヶ月","ヵ月","カ月","箇月","名前","本当","確か","時点","全部","関係","近く","方法","我々","違い","多く","扱い","新た","その後","半ば","結局","様々","以前","以後","以降","未満","以上","以下","幾つ","毎日","自体","向こう","何人","手段","同じ","感じ",
    "あ","い","う","え","お","か","き","く","け","こ","さ","し","す","せ","そ","た","ち","つ","て","と","な","に","ぬ","ね","の","は","ひ","ふ","へ","ほ","ま","み","む","め","も","や","ゆ","よ","ら","り","る","れ","ろ","わ","を","ん","が","ぎ","ぐ","げ","ご","ざ","じ","ず","ぜ","ぞ","だ","ぢ","づ","で","ど","ば","び","ぶ","べ","ぼ","ぱ","ぴ","ぷ","ぺ","ぽ",
    "。","、","です","ます","である"
  ];

  // Null-prototype table: a plain {} would make `in` report inherited names
  // such as "constructor" or "tostring" as stop words.
  var stop_words = Object.create(null);
  for (var k = 0; k < stop_word_list.length; ++k) {
    stop_words[stop_word_list[k]] = true;
  }

  var has_own = Object.prototype.hasOwnProperty;
  var segmenter = new TinySegmenter();
  var word_weight = {};
  var sens_list = [];

  // Normalize every caller-supplied delimiter to the Japanese full stop.
  var separators = separator || [];
  for (var s = 0; s < separators.length; ++s) {
    var sep = separators[s];
    if (sep === "n") {
      // "n" is the caller's shorthand for a line break (LF or CRLF).
      text = text.replace(/\r?\n/g, "。");
    } else if (typeof sep === "string") {
      // split/join replaces ALL occurrences; replace() with a string pattern
      // would stop after the first one. An empty delimiter is skipped because
      // it would otherwise put a boundary between every character.
      if (sep !== "") {
        text = text.split(sep).join("。");
      }
    } else {
      text = text.replace(sep, "。");
    }
  }
  // ASCII periods end sentences too - again, all of them, not just the first.
  text = text.split(".").join("。");

  var sentences = text.split("。");
  for (var i = 0; i < sentences.length; ++i) {
    var tokens = segmenter.segment(sentences[i]);
    var sens = {"id": i, "word_list": [], "importance": 0};
    for (var j = 0; j < tokens.length; ++j) {
      var word = tokens[j].toLowerCase();
      if (word in stop_words) {
        continue;
      }
      sens["word_list"].push(word);
      // Own-property check so words that collide with Object.prototype
      // members start from 1 instead of "function ... + 1".
      if (has_own.call(word_weight, word)) {
        word_weight[word] += 1;
      } else {
        word_weight[word] = 1;
      }
    }
    sens_list.push(sens);
  }

  // Defined elsewhere; presumably fills in each entry's "importance" - confirm.
  set_importance(sens_list, word_weight, word_list_minimum);

  // The meaning of the trailing 5 is not visible from here; it is passed
  // through exactly as before.
  var summary_id = binary_search(sens_list, word_weight, summary_number, 5);

  var summary_txt = [];
  for (var n = 0; n < summary_id.length; ++n) {
    summary_txt.push(sentences[summary_id[n]].trim());
  }
  return summary_txt;
}