采用Nagao算法統計各個子字符串的頻次,然后基于這些頻次統計每個字符串的詞頻、左右鄰個數、左右熵、交互信息(內部凝聚度)。
名詞解釋:
Nagao算法:一種快速的統計文本里所有子字符串頻次的算法。詳細算法可見http://www.doc88.com/p-664123446503.html
詞頻:該字符串在文檔中出現的次數。出現次數越多越重要。
左右鄰個數:文檔中該字符串的左邊和右邊出現的不同的字的個數。左右鄰越多,說明字符串成詞概率越高。
左右熵:文檔中該字符串的左邊和右邊出現的不同的字的數量分布的熵。類似上面的指標,有一定區別。
交互信息:每次將某字符串分成兩部分,左半部分字符串和右半部分字符串,計算其同時出現的概率除于其各自獨立出現的概率,最后取所有的劃分里面概率最小值。這個值越大,說明字符串內部凝聚度越高,越可能成詞。
算法具體流程:
1. 將輸入文件逐行讀入,按照非漢字([^\u4E00-\u9FA5]+)以及停詞“的很了么呢是嘛個都也比還這于不與才上用就好在和對挺去后沒說”,
分成一個個字符串,代碼如下:
String[] phrases = line.split("[^\u4E00-\u9FA5]+|["+stopwords+"]");
停用詞可以修改。
2. 獲取所有切分后的字符串的左子串和右子串,分別加入左、右PTable
3. 對PTable排序,并計算LTable。LTable記錄的是,排序后的PTable中,下一個子串同上一個子串具有相同字符的數量
4. 遍歷PTable和LTable,即可得到所有子字符串的詞頻、左右鄰
5. 根據所有子字符串的詞頻、左右鄰結果,輸出字符串的詞頻、左右鄰個數、左右熵、交互信息
1. NagaoAlgorithm.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
|
package com.algo.word; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.FileNotFoundException; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; public class NagaoAlgorithm { private int N; private List<String> leftPTable; private int [] leftLTable; private List<String> rightPTable; private int [] rightLTable; private double wordNumber; private Map<String, TFNeighbor> wordTFNeighbor; private final static String stopwords = "的很了么呢是嘛個都也比還這于不與才上用就好在和對挺去后沒說" ; private NagaoAlgorithm(){ //default N = 5 N = 5 ; leftPTable = new ArrayList<String>(); rightPTable = new ArrayList<String>(); wordTFNeighbor = new HashMap<String, TFNeighbor>(); } //reverse phrase private String reverse(String phrase) { StringBuilder reversePhrase = new StringBuilder(); for ( int i = phrase.length() - 1 ; i >= 0 ; i--) reversePhrase.append(phrase.charAt(i)); return reversePhrase.toString(); } //co-prefix length of s1 and s2 private int coPrefixLength(String s1, String s2){ int coPrefixLength = 0 ; for ( int i = 0 ; i < Math.min(s1.length(), s2.length()); i++){ if (s1.charAt(i) == s2.charAt(i)) coPrefixLength++; else break ; } return coPrefixLength; } //add substring of line to pTable private void addToPTable(String line){ //split line according to consecutive none Chinese character String[] phrases = line.split( "[^\u4E00-\u9FA5]+|[" +stopwords+ "]" ); for (String phrase : phrases){ for ( int i = 0 ; i < phrase.length(); i++) rightPTable.add(phrase.substring(i)); String reversePhrase = reverse(phrase); for ( int i = 0 ; i < reversePhrase.length(); i++) leftPTable.add(reversePhrase.substring(i)); wordNumber += phrase.length(); } } //count lTable private void countLTable(){ Collections.sort(rightPTable); rightLTable = new int [rightPTable.size()]; for ( int i = 1 ; i < rightPTable.size(); i++) rightLTable[i] = coPrefixLength(rightPTable.get(i- 1 ), rightPTable.get(i)); Collections.sort(leftPTable); leftLTable = new int [leftPTable.size()]; for ( int i = 1 ; i < leftPTable.size(); i++) leftLTable[i] = coPrefixLength(leftPTable.get(i- 1 ), leftPTable.get(i)); System.out.println( "Info: [Nagao Algorithm Step 2]: having sorted PTable and counted left and right LTable" ); } //according to pTable and lTable, count statistical result: TF, neighbor distribution private void countTFNeighbor(){ //get TF and right neighbor for ( int pIndex = 0 ; pIndex < rightPTable.size(); pIndex++){ String phrase = rightPTable.get(pIndex); for ( int length = 1 + rightLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = phrase.substring( 0 , length); TFNeighbor tfNeighbor = new TFNeighbor(); tfNeighbor.incrementTF(); if (phrase.length() > length) tfNeighbor.addToRightNeighbor(phrase.charAt(length)); for ( int lIndex = pIndex+ 1 ; lIndex < rightLTable.length; lIndex++){ if (rightLTable[lIndex] >= length){ tfNeighbor.incrementTF(); String coPhrase = rightPTable.get(lIndex); if (coPhrase.length() > length) tfNeighbor.addToRightNeighbor(coPhrase.charAt(length)); } else break ; } wordTFNeighbor.put(word, tfNeighbor); } } //get left neighbor for ( int pIndex = 0 ; pIndex < leftPTable.size(); pIndex++){ String phrase = leftPTable.get(pIndex); for ( int length = 1 + leftLTable[pIndex]; length <= N && length <= phrase.length(); length++){ String word = reverse(phrase.substring( 0 , length)); TFNeighbor tfNeighbor = wordTFNeighbor.get(word); if (phrase.length() > length) tfNeighbor.addToLeftNeighbor(phrase.charAt(length)); for ( int lIndex = pIndex + 1 ; lIndex < leftLTable.length; lIndex++){ if (leftLTable[lIndex] >= length){ String coPhrase = leftPTable.get(lIndex); if (coPhrase.length() > length) tfNeighbor.addToLeftNeighbor(coPhrase.charAt(length)); } else break ; } } } System.out.println( "Info: [Nagao Algorithm Step 3]: having counted TF and Neighbor" ); } //according to wordTFNeighbor, count MI of word private double countMI(String word){ if (word.length() <= 1 ) return 0 ; double coProbability = wordTFNeighbor.get(word).getTF()/wordNumber; List<Double> mi = new ArrayList<Double>(word.length()); for ( int pos = 1 ; pos < word.length(); pos++){ String leftPart = word.substring( 0 , pos); String rightPart = word.substring(pos); double leftProbability = wordTFNeighbor.get(leftPart).getTF()/wordNumber; double rightProbability = wordTFNeighbor.get(rightPart).getTF()/wordNumber; mi.add(coProbability/(leftProbability*rightProbability)); } return Collections.min(mi); } //save TF, (left and right) neighbor number, neighbor entropy, mutual information private void saveTFNeighborInfoMI(String out, String stopList, String[] threshold){ try { //read stop words file Set<String> stopWords = new HashSet<String>(); BufferedReader br = new BufferedReader( new FileReader(stopList)); String line; while ((line = br.readLine()) != null ){ if (line.length() > 1 ) stopWords.add(line); } br.close(); //output words TF, neighbor info, MI BufferedWriter bw = new BufferedWriter( new FileWriter(out)); for (Map.Entry<String, TFNeighbor> entry : wordTFNeighbor.entrySet()){ if ( entry.getKey().length() <= 1 || stopWords.contains(entry.getKey()) ) continue ; TFNeighbor tfNeighbor = entry.getValue(); int tf, leftNeighborNumber, rightNeighborNumber; double mi; tf = tfNeighbor.getTF(); leftNeighborNumber = tfNeighbor.getLeftNeighborNumber(); rightNeighborNumber = tfNeighbor.getRightNeighborNumber(); mi = countMI(entry.getKey()); if (tf > Integer.parseInt(threshold[ 0 ]) && leftNeighborNumber > Integer.parseInt(threshold[ 1 ]) && rightNeighborNumber > Integer.parseInt(threshold[ 2 ]) && mi > Integer.parseInt(threshold[ 3 ]) ){ StringBuilder sb = new StringBuilder(); sb.append(entry.getKey()); sb.append( "," ).append(tf); sb.append( "," ).append(leftNeighborNumber); sb.append( "," ).append(rightNeighborNumber); sb.append( "," ).append(tfNeighbor.getLeftNeighborEntropy()); sb.append( "," ).append(tfNeighbor.getRightNeighborEntropy()); sb.append( "," ).append(mi).append( "\n" ); bw.write(sb.toString()); } } bw.close(); } catch (IOException e) { throw new RuntimeException(e); } System.out.println( "Info: [Nagao Algorithm Step 4]: having saved to file" ); } //apply nagao algorithm to input file public static void applyNagao(String[] inputs, String out, String stopList){ NagaoAlgorithm nagao = new NagaoAlgorithm(); //step 1: add phrases to PTable String line; for (String in : inputs){ try { BufferedReader br = new BufferedReader( new FileReader(in)); while ((line = br.readLine()) != null ){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println( "Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable" ); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, "20,3,3,5" .split( "," )); } public static void applyNagao(String[] inputs, String out, String stopList, int n, String filter){ NagaoAlgorithm nagao = new NagaoAlgorithm(); nagao.setN(n); String[] threshold = filter.split( "," ); if (threshold.length != 4 ){ System.out.println( "ERROR: filter must have 4 numbers, seperated with ',' " ); return ; } //step 1: add phrases to PTable String line; for (String in : inputs){ try { BufferedReader br = new BufferedReader( new FileReader(in)); while ((line = br.readLine()) != null ){ nagao.addToPTable(line); } br.close(); } catch (IOException e) { throw new RuntimeException(); } } System.out.println( "Info: [Nagao Algorithm Step 1]: having added all left and right substrings to PTable" ); //step 2: sort PTable and count LTable nagao.countLTable(); //step3: count TF and Neighbor nagao.countTFNeighbor(); //step4: save TF NeighborInfo and MI nagao.saveTFNeighborInfoMI(out, stopList, threshold); } private void setN( int n){ N = n; } public static void main(String[] args) { String[] ins = { "E://test//ganfen.txt" }; applyNagao(ins, "E://test//out.txt" , "E://test//stoplist.txt" ); } } |
2. TFNeighbor.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
|
package com.algo.word; import java.util.HashMap; import java.util.Map; public class TFNeighbor { private int tf; private Map<Character, Integer> leftNeighbor; private Map<Character, Integer> rightNeighbor; TFNeighbor(){ leftNeighbor = new HashMap<Character, Integer>(); rightNeighbor = new HashMap<Character, Integer>(); } //add word to leftNeighbor public void addToLeftNeighbor( char word){ //leftNeighbor.put(word, 1 + leftNeighbor.getOrDefault(word, 0)); Integer number = leftNeighbor.get(word); leftNeighbor.put(word, number == null ? 1 : 1 +number); } //add word to rightNeighbor public void addToRightNeighbor( char word){ //rightNeighbor.put(word, 1 + rightNeighbor.getOrDefault(word, 0)); Integer number = rightNeighbor.get(word); rightNeighbor.put(word, number == null ? 1 : 1 +number); } //increment tf public void incrementTF(){ tf++; } public int getLeftNeighborNumber(){ return leftNeighbor.size(); } public int getRightNeighborNumber(){ return rightNeighbor.size(); } public double getLeftNeighborEntropy(){ double entropy = 0 ; int sum = 0 ; for ( int number : leftNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if (sum == 0 ) return 0 ; return Math.log(sum) - entropy/sum; } public double getRightNeighborEntropy(){ double entropy = 0 ; int sum = 0 ; for ( int number : rightNeighbor.values()){ entropy += number*Math.log(number); sum += number; } if (sum == 0 ) return 0 ; return Math.log(sum) - entropy/sum; } public int getTF(){ return tf; } } |
3. Main.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
package com.algo.word; public class Main { public static void main(String[] args) { //if 3 arguments, first argument is input files splitting with ',' //second argument is output file //output 7 columns split with ',' , like below: //word, term frequency, left neighbor number, right neighbor number, left neighbor entropy, right neighbor entropy, mutual information //third argument is stop words list if (args.length == 3 ) NagaoAlgorithm.applyNagao(args[ 0 ].split( "," ), args[ 1 ], args[ 2 ]); //if 4 arguments, forth argument is the NGram parameter N //5th argument is threshold of output words, default is "20,3,3,5" //output TF > 20 && (left | right) neighbor number > 3 && MI > 5 else if (args.length == 5 ) NagaoAlgorithm.applyNagao(args[ 0 ].split( "," ), args[ 1 ], args[ 2 ], Integer.parseInt(args[ 3 ]), args[ 4 ]); } } |
以上所述就是本文的全部內容了,希望大家能夠喜歡。