-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrake.py
More file actions
183 lines (131 loc) · 5.44 KB
/
rake.py
File metadata and controls
183 lines (131 loc) · 5.44 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#coding=utf-8
from __future__ import absolute_import
import re
import operator
import six
from six.moves import range
debug = False
test = False
def is_number(s):
try:
float(s) if '.' in s else int(s)
return True
except ValueError:
return False
def load_stop_words(stop_word_file):
#传入停止词列表,获取没意义的词语
stop_words = []
for line in open(stop_word_file):
if line.strip()[0:1] != "#":
for word in line.split(): #对每一行进行拆分
stop_words.append(word)
return stop_words
def separate_words(text, min_word_return_size):
#这个函数返回一个列表,列表内是进行分词以后的内容
splitter = re.compile('[^a-zA-Z0-9_\\+\\-/]')
words = []
for single_word in splitter.split(text):
current_word = single_word.strip().lower()
if len(current_word) > min_word_return_size and current_word != '' and not is_number(current_word):
words.append(current_word)
return words
def split_sentences(text):
sentence_delimiters = re.compile(u'[\\[\\]\n.!?,;:\t\\-\\"\\(\\)\\\'\u2019\u2013]')
sentences = sentence_delimiters.split(text)
return sentences
def build_stop_word_regex(stop_word_file_path):
stop_word_list = load_stop_words(stop_word_file_path)
stop_word_regex_list = []
for word in stop_word_list:
word_regex = '\\b' + word + '\\b'
stop_word_regex_list.append(word_regex)
stop_word_pattern = re.compile('|'.join(stop_word_regex_list), re.IGNORECASE)
return stop_word_pattern
def generate_candidate_keywords(sentence_list, stopword_pattern, min_char_length=1, max_words_length=5):
phrase_list = []
for s in sentence_list:
tmp = re.sub(stopword_pattern, '|', s.strip())
phrases = tmp.split("|")
for phrase in phrases:
phrase = phrase.strip().lower()
if phrase != "" and is_acceptable(phrase, min_char_length, max_words_length):
phrase_list.append(phrase)
return phrase_list
def is_acceptable(phrase, min_char_length, max_words_length):
if len(phrase) < min_char_length:
return 0
words = phrase.split()
if len(words) > max_words_length:
return 0
digits = 0
alpha = 0
for i in range(0, len(phrase)):
if phrase[i].isdigit():
digits += 1
elif phrase[i].isalpha():
alpha += 1
if alpha == 0:
return 0
if digits > alpha:
return 0
return 1
def calculate_word_scores(phraseList):
word_frequency = {}
word_degree = {}
for phrase in phraseList:
word_list = separate_words(phrase, 0)
word_list_length = len(word_list)
word_list_degree = word_list_length - 1
for word in word_list:
word_frequency.setdefault(word, 0)
word_frequency[word] += 1
word_degree.setdefault(word, 0)
word_degree[word] += word_list_degree
for item in word_frequency:
word_degree[item] = word_degree[item] + word_frequency[item]
word_score = {}
for item in word_frequency:
word_score.setdefault(item, 0)
word_score[item] = word_degree[item] / (word_frequency[item] * 1.0)
return word_score
def generate_candidate_keyword_scores(phrase_list, word_score, min_keyword_frequency=1):
keyword_candidates = {}
for phrase in phrase_list:
if min_keyword_frequency > 1:
if phrase_list.count(phrase) < min_keyword_frequency:
continue
keyword_candidates.setdefault(phrase, 0)
word_list = separate_words(phrase, 0)
candidate_score = 0
for word in word_list:
candidate_score += word_score[word]
keyword_candidates[phrase] = candidate_score
return keyword_candidates
class Rake(object):
def __init__(self, stop_words_path, min_char_length=1, max_words_length=5, min_keyword_frequency=1):
self.__stop_words_path = stop_words_path
self.__stop_words_pattern = build_stop_word_regex(stop_words_path)
self.__min_char_length = min_char_length
self.__max_words_length = max_words_length
self.__min_keyword_frequency = min_keyword_frequency
def run(self, text):
sentence_list = split_sentences(text)
phrase_list = generate_candidate_keywords(sentence_list, self.__stop_words_pattern, self.__min_char_length, self.__max_words_length)
word_scores = calculate_word_scores(phrase_list)
keyword_candidates = generate_candidate_keyword_scores(phrase_list, word_scores, self.__min_keyword_frequency)
sorted_keywords = sorted(six.iteritems(keyword_candidates), key=operator.itemgetter(1), reverse=True)
return sorted_keywords
def select_kw(text):
sentenceList = split_sentences(text)
stoppath = "SmartStoplist.txt"
stopwordpattern = build_stop_word_regex(stoppath)
phraseList = generate_candidate_keywords(sentenceList, stopwordpattern)
wordscores = calculate_word_scores(phraseList)
keywordcandidates = generate_candidate_keyword_scores(phraseList, wordscores)
sortedKeywords = sorted(six.iteritems(keywordcandidates), key=operator.itemgetter(1), reverse=True)
totalKeywords = len(sortedKeywords)
#print sortedKeywords[0:(totalKeywords // 3)]
rake = Rake("SmartStoplist.txt")
keywords = rake.run(text)
#print keywords
return keywords