TF-IDF Keyword Extraction with NLTK

Building on NLTK, this post summarizes TF-IDF-based keyword extraction, along with text normalization (preprocessing), SVD decomposition, and keyphrase extraction based on TF-IDF weights and term frequencies.

SVD (Singular Value Decomposition)

from scipy.sparse.linalg import svds
import re
import nltk
import unicodedata

def low_rank_svd(matrix, singular_count=2):
    u, s, vt = svds(matrix, k=singular_count)
    return u, s, vt
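low_rank_svd is not exercised again later in the post, so here is a minimal usage sketch, assuming a made-up 4×3 matrix, just to show the shapes that svds returns:

import numpy as np

# Made-up matrix purely for illustration
mat = np.array([[1.0, 0.0, 2.0],
                [0.0, 3.0, 0.0],
                [4.0, 0.0, 1.0],
                [0.0, 2.0, 2.0]])
u, s, vt = low_rank_svd(mat, singular_count=2)
print(u.shape, s.shape, vt.shape)   # (4, 2) (2,) (2, 3)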
Remove newlines and split the document into sentences
def parse_document(document):
    document = re.sub('\n', ' ', document)
    if isinstance(document, str):
        # Normalize unicode characters to their closest ASCII equivalents
        document = unicodedata.normalize('NFKD', document).encode('ascii', 'ignore').decode('ascii')
    else:
        raise ValueError('Document is not a string!')
    document = document.strip()
    sentences = nltk.sent_tokenize(document)
    sentences = [sentence.strip() for sentence in sentences]
    return sentences
Unescape HTML entities

from html.parser import HTMLParser

html_parser=HTMLParser()
def unescape_html(parser, text):
    # HTMLParser.unescape() is deprecated and was removed in Python 3.9; html.unescape() is the modern replacement
    return parser.unescape(text)

Contraction map
CONTRACTION_MAP = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

Text normalization

import string
from nltk.stem import WordNetLemmatizer

stopword_list = nltk.corpus.stopwords.words('english')
wnl = WordNetLemmatizer()
html_parser = HTMLParser()
Tokenize text
def tokenize_text(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    return tokens
Expand contractions
def expand_contractions(text, contraction_mapping):

    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())),
                                      flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        expanded_contraction = first_char + expanded_contraction[1:]
        return expanded_contraction

    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
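A quick usage sketch with a made-up sentence, showing both the expansion and the removal of any leftover apostrophes:

# Example sentence made up for illustration
print(expand_contractions("Y'all can't have left, I'd say.", CONTRACTION_MAP))
# Expected: You all cannot have left, I would say.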
POS-tag the text
from pattern.en import tag
from nltk.corpus import wordnet as wn

# POS-tag the text
def pos_tag_text(text):

    # Map Penn Treebank tags to WordNet tags
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        else:
            return None

    tagged_text = tag(text)
    tagged_lower_text = [(word.lower(), penn_to_wn_tags(pos_tag))
                         for word, pos_tag in tagged_text]
    return tagged_lower_text
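The pattern library can be awkward to install on recent Python versions. As a hedged alternative (an assumption, not the original choice here), the same tag mapping can be driven by nltk.pos_tag, which also emits Penn Treebank tags:

# Assumption: drop-in alternative to pos_tag_text that uses nltk.pos_tag instead of pattern.en.tag
def pos_tag_text_nltk(text):
    def penn_to_wn_tags(pos_tag):
        if pos_tag.startswith('J'):
            return wn.ADJ
        elif pos_tag.startswith('V'):
            return wn.VERB
        elif pos_tag.startswith('N'):
            return wn.NOUN
        elif pos_tag.startswith('R'):
            return wn.ADV
        return None

    tagged_text = nltk.pos_tag(nltk.word_tokenize(text))
    return [(word.lower(), penn_to_wn_tags(pos_tag)) for word, pos_tag in tagged_text]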
Lemmatize tokens based on their POS tags
def lemmatize_text(text):

    pos_tagged_text = pos_tag_text(text)
    lemmatized_tokens = [wnl.lemmatize(word, pos_tag) if pos_tag
                         else word
                         for word, pos_tag in pos_tagged_text]
    lemmatized_text = ' '.join(lemmatized_tokens)
    return lemmatized_text
Remove special characters
def remove_special_characters(text):
    tokens = tokenize_text(text)
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = filter(None, [pattern.sub(' ', token) for token in tokens])
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
Remove stopwords
def remove_stopwords(text):
    tokens = tokenize_text(text)
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)
    return filtered_text
Unescape HTML entities
def unescape_html(parser, text):
    return parser.unescape(text)
Normalize text (running the steps above in sequence)
def normalize_corpus(corpus, lemmatize=True, tokenize=False):

    normalized_corpus = []
    for text in corpus:
        text = html_parser.unescape(text)
        text = expand_contractions(text, CONTRACTION_MAP)
        if lemmatize:
            text = lemmatize_text(text)
        else:
            text = text.lower()
        text = remove_special_characters(text)
        text = remove_stopwords(text)
        if tokenize:
            text = tokenize_text(text)
            normalized_corpus.append(text)
        else:
            normalized_corpus.append(text)

    return normalized_corpus
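A minimal usage sketch with a made-up one-document corpus (lemmatize=False so it runs without pattern installed):

# Example corpus made up for illustration
print(normalize_corpus(["The brown foxes aren't running!"], lemmatize=False))
# Expected: something like ['brown foxes running']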

Text feature extraction

  • Binary features based on term occurrence
  • Frequency features based on the bag-of-words model
  • TF-IDF weight features (the classical formula is sketched just below)
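For reference, the classical TF-IDF weight of term t in document d over a corpus of N documents is tf(t, d) × log(N / df(t)). A tiny hand-computed sketch with made-up counts (note that scikit-learn's TfidfVectorizer smooths the idf and L2-normalizes each row, and gensim's TfidfModel uses log base 2 by default, so library values will differ slightly):

import math

# Made-up counts, for illustration only
tf = 3      # the term appears 3 times in the document
df = 2      # the term appears in 2 of the corpus documents
N = 10      # corpus size
print(round(tf * math.log(N / df), 3))   # 4.828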
Build the feature matrix (binary, frequency, or tfidf)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def build_feature_matrix(documents, feature_type='frequency'):
    feature_type = feature_type.lower().strip()
    if feature_type == 'binary':
        vectorizer = CountVectorizer(binary=True, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'frequency':
        vectorizer = CountVectorizer(binary=False, min_df=1, ngram_range=(1, 1))
    elif feature_type == 'tfidf':
        vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 1))
    else:
        raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
    feature_matrix = vectorizer.fit_transform(documents).astype(float)
    return vectorizer, feature_matrix
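A quick usage sketch on a made-up two-document corpus, showing the shape of the resulting TF-IDF matrix (get_feature_names_out assumes scikit-learn >= 1.0; older versions expose get_feature_names instead):

# Toy corpus made up for illustration
docs = ['the sky is blue', 'the sun is bright today']
vectorizer, feature_matrix = build_feature_matrix(docs, feature_type='tfidf')
print(feature_matrix.shape)                 # (2, 7)
print(vectorizer.get_feature_names_out())   # the 7-word vocabulary learned from the toy corpus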

Keyphrase extraction

Term collocations

from nltk.corpus import gutenberg
import nltk
from operator import itemgetter

alice = gutenberg.sents(fileids='carroll-alice.txt')
alice = [' '.join(ts) for ts in alice]
norm_alice = normalize_corpus(alice, lemmatize=False)
Flatten the corpus into one large text string
def flatten_corpus(corpus):
    return ' '.join([document.strip()
                     for document in corpus])
Compute n-grams (a neat trick; a short demo follows the function)
def compute_ngrams(sequence, n):
    # zip() is applied to n shifted copies of the sequence and stops at the shortest one,
    # so this yields exactly len(sequence) - n + 1 n-grams
    return zip(*[sequence[index:] for index in range(n)])
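A small demo of the trick on a made-up token list:

print(list(compute_ngrams(['a', 'b', 'c', 'd'], 2)))
# [('a', 'b'), ('b', 'c'), ('c', 'd')]
print(list(compute_ngrams(['a', 'b', 'c', 'd'], 3)))
# [('a', 'b', 'c'), ('b', 'c', 'd')]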
Get the top n-grams
def get_top_ngram(corpus, ngram_val=1, limit=5):
    corpus = flatten_corpus(corpus)
    tokens = nltk.word_tokenize(corpus)

    ngrams = compute_ngrams(tokens, ngram_val)
    # Count n-gram frequencies
    ngrams_freq_dist = nltk.FreqDist(ngrams)
    # Sort by frequency, descending
    sorted_ngrams_fd = sorted(ngrams_freq_dist.items(), key=itemgetter(1), reverse=True)
    sorted_ngrams = sorted_ngrams_fd[0:limit]
    sorted_ngrams = [(' '.join(text), freq) for text, freq in sorted_ngrams]
    return sorted_ngrams
Print the top 10 bigrams by frequency
get_top_ngram(corpus=norm_alice,ngram_val=2,limit=10)

Output:

[('said alice', 123),
('mock turtle', 56),
('march hare', 31),
('said king', 29),
('thought alice', 26),
('white rabbit', 22),
('said hatter', 22),
('said mock', 20),
('said caterpillar', 18),
('said gryphon', 18)]

Top 10 trigrams by frequency
get_top_ngram(corpus=norm_alice,ngram_val=3,limit=10)

Output:

[('said mock turtle', 20),
('said march hare', 9),
('poor little thing', 6),
('little golden key', 5),
('certainly said alice', 5),
('white kid gloves', 5),
('march hare said', 5),
('mock turtle said', 5),
('know said alice', 4),
('might well say', 4)]
Top 10 unigrams by frequency
get_top_ngram(corpus=norm_alice,ngram_val=1,limit=10)

Output:

[('said', 462),
('alice', 398),
('little', 128),
('one', 104),
('know', 88),
('like', 85),
('would', 83),
('went', 83),
('could', 77),
('queen', 75)]

Using NLTK's collocation finders

Bigram collocations
from nltk.collocations import BigramCollocationFinder
from nltk.collocations import BigramAssocMeasures

finder=BigramCollocationFinder.from_documents([item.split() for item in norm_alice])

bigram_measures=BigramAssocMeasures()

# Rank bigram collocations by raw frequency
finder.nbest(bigram_measures.raw_freq,10)

Output:

[('said', 'alice'),
('mock', 'turtle'),
('march', 'hare'),
('said', 'king'),
('thought', 'alice'),
('said', 'hatter'),
('white', 'rabbit'),
('said', 'mock'),
('said', 'caterpillar'),
('said', 'gryphon')]
Finding bigram collocations with pointwise mutual information (PMI)
finder.nbest(bigram_measures.pmi,10)
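Pointwise mutual information scores a pair (x, y) by log(p(x, y) / (p(x) p(y))), so it favours words that co-occur far more often than chance would predict, even when both are rare. A minimal hand-rolled sketch with made-up counts:

import math

# Made-up counts: the pair occurs 5 times, each word 10 times, in a 1000-token corpus
pair_count, x_count, y_count, total = 5, 10, 10, 1000
pmi = math.log2((pair_count / total) / ((x_count / total) * (y_count / total)))
print(round(pmi, 2))   # 5.64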
Trigram collocations
from nltk.collocations import TrigramAssocMeasures
from nltk.collocations import TrigramCollocationFinder

finder=TrigramCollocationFinder.from_documents([item.split() for item in norm_alice])

trigram_measures=TrigramAssocMeasures()

# Rank trigrams by raw frequency
finder.nbest(trigram_measures.raw_freq,10)

Output:

[('said', 'mock', 'turtle'),
('said', 'march', 'hare'),
('poor', 'little', 'thing'),
('little', 'golden', 'key'),
('march', 'hare', 'said'),
('mock', 'turtle', 'said'),
('white', 'kid', 'gloves'),
('beau', 'ootiful', 'soo'),
('certainly', 'said', 'alice'),
('might', 'well', 'say')]

Finding trigram collocations with PMI
finder.nbest(trigram_measures.pmi,10)

Output:

[('accustomed', 'usurpation', 'conquest'),
('adjourn', 'immediate', 'adoption'),
('adoption', 'energetic', 'remedies'),
('ancient', 'modern', 'seaography'),
('apple', 'roast', 'turkey'),
('arithmetic', 'ambition', 'distraction'),
('brother', 'latin', 'grammar'),
('canvas', 'bag', 'tied'),
('cherry', 'tart', 'custard'),
('circle', 'exact', 'shape')]

Weighted tag-based phrase extraction

  • Use shallow parsing to extract all noun-phrase chunks
  • Compute the TF-IDF weight of each chunk and return the highest-weighted phrases
toy_text = """
Elephants are large mammals of the family Elephantidae
and the order Proboscidea. Two species are traditionally recognised,
the African elephant and the Asian elephant. Elephants are scattered
throughout sub-Saharan Africa, South Asia, and Southeast Asia. Male
African elephants are the largest extant terrestrial animals. All
elephants have a long trunk used for many purposes,
particularly breathing, lifting water and grasping objects. Their
incisors grow into tusks, which can serve as weapons and as tools
for moving objects and digging. Elephants' large ear flaps help
to control their body temperature. Their pillar-like legs can
carry their great weight. African elephants have larger ears
and concave backs while Asian elephants have smaller ears
and convex or level backs.
"""
import numpy as np
import itertools
from gensim import corpora, models

Essentially, we define a grammar pattern for chunking, i.e. for extracting noun phrases. We build a chunker from that pattern and, for each sentence in the document, first annotate it with POS tags (which is why the text should not be normalized beforehand), then build a shallow parse tree in which the noun phrases form the chunks and all remaining POS-tagged words form the chinks, the parts that do not belong to any chunk. Once this is done, we use the tree2conlltags function to generate (w, t, c) triples: the word, its POS tag, and its chunk tag in IOB format. We drop every triple whose chunk tag is 'O', since those are words or terms that do not belong to any chunk. Finally, from the remaining valid chunks, we join the chunked terms and generate phrases from each chunk group.
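As a small illustration of the (w, t, c) triples described above, here is a made-up toy sentence chunked with the same grammar and converted with tree2conlltags (the exact tags may vary slightly with the tagger version):

# Toy sentence made up for illustration
toy_sent = nltk.pos_tag(nltk.word_tokenize("The Asian elephant has smaller ears"))
toy_chunker = nltk.chunk.regexp.RegexpParser(r'NP: {<DT>? <JJ>* <NN.*>+}')
print(nltk.chunk.tree2conlltags(toy_chunker.parse(toy_sent)))
# Roughly: [('The', 'DT', 'B-NP'), ('Asian', 'JJ', 'I-NP'), ('elephant', 'NN', 'I-NP'),
#           ('has', 'VBZ', 'O'), ...] -- words outside any noun phrase carry the 'O' tag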

Extract the noun-phrase chunks from the document
def get_chunks(sentences, grammar=r'NP: {<DT>? <JJ>* <NN.*>+}'):

    all_chunks = []
    chunker = nltk.chunk.regexp.RegexpParser(grammar)

    for sentence in sentences:

        tagged_sents = nltk.pos_tag_sents(
            [nltk.word_tokenize(sentence)])

        chunks = [chunker.parse(tagged_sent)
                  for tagged_sent in tagged_sents]

        # (word, POS tag, IOB chunk tag) triples for the sentence
        wtc_sents = [nltk.chunk.tree2conlltags(chunk)
                     for chunk in chunks]

        flattened_chunks = list(
            itertools.chain.from_iterable(
                wtc_sent for wtc_sent in wtc_sents)
        )

        # Group consecutive triples by whether their chunk tag is 'O' (outside any chunk)
        valid_chunks_tagged = [(status, [wtc for wtc in chunk])
                               for status, chunk in itertools.groupby(
                                   flattened_chunks, lambda wtc: wtc[2] != 'O')]

        # Keep only the in-chunk groups and join their non-stopword words into phrases
        valid_chunks = [' '.join(word.lower()
                                 for word, tag, chunk in wtc_group
                                 if word.lower() not in stopword_list)
                        for status, wtc_group in valid_chunks_tagged
                        if status]
        all_chunks.append(valid_chunks)

    return all_chunks
sentences = parse_document(toy_text)          
valid_chunks = get_chunks(sentences)
Get TF-IDF-weighted keyphrases
def get_tfidf_weighted_keyphrases(sentences,
                                  grammar=r'NP: {<DT>? <JJ>* <NN.*>+}',
                                  top_n=10):

    valid_chunks = get_chunks(sentences, grammar=grammar)

    # Each sentence's chunk list is treated as a document; each chunk phrase is a token
    dictionary = corpora.Dictionary(valid_chunks)
    corpus = [dictionary.doc2bow(chunk) for chunk in valid_chunks]

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    weighted_phrases = {dictionary.get(id): round(value, 3)
                        for doc in corpus_tfidf
                        for id, value in doc}

    weighted_phrases = sorted(weighted_phrases.items(),
                              key=itemgetter(1), reverse=True)

    return weighted_phrases[:top_n]
Top two keyphrases
get_tfidf_weighted_keyphrases(sentences, top_n=2)

Output:

[('elephants large mammals family elephantidae order proboscidea .', 1.0),
('two species traditionally recognised , african elephant asian elephant .',
1.0)]
Experimenting on another corpus (the Alice sentences)
get_tfidf_weighted_keyphrases(alice, top_n=10)

Output:

[("[ alice ' adventures wonderland lewis carroll 1865 ]", 1.0),
('chapter .', 1.0),
('rabbit - hole', 1.0),
("alice beginning get tired sitting sister bank , nothing : twice peeped book sister reading , pictures conversations , ' use book , ' thought alice ' without pictures conversation ? '",
1.0),
('considering mind ( well could , hot day made feel sleepy stupid ) , whether pleasure making daisy - chain would worth trouble getting picking daisies , suddenly white rabbit pink eyes ran close .',
1.0),
("nothing remarkable ; alice think much way hear rabbit say , ' oh dear !",
1.0),
('oh dear !', 1.0),
("shall late ! '", 1.0),
('( thought afterwards , occurred ought wondered , time seemed quite natural ) ; rabbit actually took watch waistcoat - pocket , looked , hurried , alice started feet , flashed across mind never seen rabbit either waistcoat - pocket , watch take , burning curiosity , ran across field , fortunately time see pop large rabbit - hole hedge .',
1.0),
('another moment went alice , never considering world get .', 1.0)]
Author: CinKate
Link: http://renxingkai.github.io/2019/04/10/tfidfkeyextraction/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.