Word2Vec Notes (TF-IDF Weighted Word Vectors)

Today is the Qingming (Tomb-Sweeping) holiday, and this blogger is still in the lab writing code, 23333.
This post records some Word2Vec-related know-how~

The Word2Vec Model

Train the word vectors directly with the open-source gensim library:

import gensim
import nltk
import numpy as np

# a small hand-made corpus
CORPUS = [
'the sky is blue',
'sky is blue and sky is beautiful',
'the beautiful sky is so blue',
'i love blue cheese'
]

new_doc = ['loving this blue sky today']

Tokenize the corpus

#tokenize corpus
TOKENIZED_CORPUS=[nltk.word_tokenize(sentence) for sentence in CORPUS]
tokenized_new_doc=[nltk.word_tokenize(sentence) for sentence in new_doc]
print(TOKENIZED_CORPUS)
print(tokenized_new_doc)

Output

[['the', 'sky', 'is', 'blue'],
['sky', 'is', 'blue', 'and', 'sky', 'is', 'beautiful'],
['the', 'beautiful', 'sky', 'is', 'so', 'blue'],
['i', 'love', 'blue', 'cheese']]

[['loving', 'this', 'blue', 'sky', 'today']]

Build the word vectors

model = gensim.models.Word2Vec(TOKENIZED_CORPUS, size=10, window=10, min_count=2, sample=1e-3)
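
With such a tiny corpus the vectors are essentially random, but the trained model can already be inspected. A minimal sketch (assuming a pre-4.0 gensim, matching the size/index2word API used above):

# words that survived min_count=2: 'the', 'sky', 'is', 'blue', 'beautiful'
print(model.wv.index2word)
# the 10-dimensional vector learned for a single word
print(model.wv['sky'])
# nearest neighbours by cosine similarity (not meaningful on 4 sentences)
print(model.wv.most_similar('blue', topn=3))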

Represent a document by averaging its word vectors

# num_features is the dimensionality of the word vectors (here 10)
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype='float64')
    nwords = 0
    for word in words:
        if word in vocabulary:
            nwords = nwords + 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    # all words the Word2Vec model actually learned
    vocabulary = set(model.wv.index2word)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                for tokenized_sentence in corpus]
    return np.array(features)

avg_word_vec_features = averaged_word_vectorizer(TOKENIZED_CORPUS, model=model, num_features=10)
print(avg_word_vec_features)

Output

array([[-0.00710545, -0.01549264,  0.02188712, -0.00322829,  0.00586532,
-0.00687592, 0.00339291, -0.01177494, 0.00265543, -0.00539964],
[-0.0157312 , -0.01630003, 0.00551589, 0.00166568, 0.02385859,
0.0085727 , 0.02538068, -0.02266891, 0.02231819, -0.02521743],
[-0.0070758 , -0.00578274, 0.01280785, -0.00960104, 0.00821758,
-0.00023592, 0.01009926, -0.00624976, 0.00913788, -0.01323305],
[ 0.01571231, -0.02214988, 0.02293927, -0.03584988, -0.02027377,
0.00031135, 0.00284845, 0.01365358, 0.00845861, -0.0247597 ]])

nd_avg_word_vec_features=averaged_word_vectorizer(corpus=tokenized_new_doc,model=model,num_features=10)
print(nd_avg_word_vec_features)

Output

array([[-0.00968785, -0.02889012,  0.02670473, -0.01596956,  0.00815679,
-0.00325876, 0.02226594, -0.01347479, 0.01384218, -0.01042995]])
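
Once every document is a fixed-length vector, ordinary vector operations apply. A minimal usage sketch (my own addition, not from the original post), comparing the new document against the corpus documents with scikit-learn's cosine_similarity:

from sklearn.metrics.pairwise import cosine_similarity

# one similarity score per corpus document; higher means closer
print(cosine_similarity(nd_avg_word_vec_features, avg_word_vec_features))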

TF-IDF Weighted Average Word Vectors

If plain averaging does not work well, or feels too simplistic, you can compute a TF-IDF score for each word and use it as the weight when averaging the word vectors.
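
Concretely, the representation implemented below is the TF-IDF-weighted mean of a document's in-vocabulary word vectors:

$$\vec{d} = \frac{\sum_{w \in d} \mathrm{tfidf}(w, d)\,\vec{v}_w}{\sum_{w \in d} \mathrm{tfidf}(w, d)}$$

where $\vec{v}_w$ is the Word2Vec vector of word $w$ and the sums run over the words of document $d$ that appear in the Word2Vec vocabulary.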

def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    # look up each word's TF-IDF weight; .get(word) can legitimately return column index 0,
    # so test for None explicitly rather than relying on truthiness
    word_tfidfs = [tfidf_vector[0, tfidf_vocabulary.get(word)]
                   if tfidf_vocabulary.get(word) is not None else 0
                   for word in words]
    word_tfidf_map = {word: tfidf_val for word, tfidf_val in zip(words, word_tfidfs)}
    feature_vector = np.zeros((num_features,), dtype='float64')
    vocabulary = set(model.wv.index2word)
    wts = 0
    for word in words:
        if word in vocabulary:
            word_vector = model.wv[word]
            weighted_word_vector = word_tfidf_map[word] * word_vector
            wts = wts + word_tfidf_map[word]
            feature_vector = np.add(feature_vector, weighted_word_vector)
    if wts:
        feature_vector = np.divide(feature_vector, wts)
    return feature_vector

def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors, tfidf_vocabulary, model, num_features):
    # pair each tokenized document with its row of the TF-IDF matrix
    docs_tfidfs = [(doc, doc_tfidf) for doc, doc_tfidf in zip(corpus, tfidf_vectors)]
    features = [tfidf_wtd_avg_word_vectors(tokenized_sentence, tfidf, tfidf_vocabulary, model, num_features)
                for tokenized_sentence, tfidf in docs_tfidfs]
    return np.array(features)

TF-IDF preprocessing

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix

def tfidf_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

def bow_extractor(corpus, ngram_range=(1, 1)):
    # min_df=1 keeps any term that appears in at least one document
    # ngram_range=(1, 3) would build a space of all unigrams, bigrams and trigrams
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features


def display_features(features, feature_names):
    df = pd.DataFrame(data=features,
                      columns=feature_names)
    print(df)

# bag-of-words counts, then TF-IDF both via the transformer and directly via TfidfVectorizer
bow_vectorizer, bow_features = bow_extractor(CORPUS)
feature_names = bow_vectorizer.get_feature_names()
tfidf_trans, tfidf_features = tfidf_transformer(bow_features)
tfidf_vectorizer, tfidf_features_direct = tfidf_extractor(CORPUS)
display_features(np.round(tfidf_features_direct.todense(), 2), feature_names)
nd_tfidf = tfidf_vectorizer.transform(new_doc)
display_features(np.round(nd_tfidf.todense(), 2), feature_names)
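
As a quick sanity check (my own addition, not in the original post), the CountVectorizer + TfidfTransformer route and TfidfVectorizer should produce the same matrix here, since both use the same settings; vocabulary_ is the word-to-column mapping passed to the weighting function above as tfidf_vocabulary:

# the two TF-IDF pipelines should agree (same norm/smooth_idf/use_idf on both sides)
print(np.allclose(tfidf_features.todense(), tfidf_features_direct.todense()))
# word -> column index in the TF-IDF matrix (alphabetical order by default)
print(tfidf_vectorizer.vocabulary_)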

TF-IDF weighted word vectors

corpus_tfidf=tfidf_features
vocab=tfidf_vectorizer.vocabulary_

wt_tfidf_word_vec_features=tfidf_weighted_averaged_word_vectorizer(corpus=TOKENIZED_CORPUS,tfidf_vectors=corpus_tfidf,tfidf_vocabulary=vocab,model=model,num_features=10)

print(wt_tfidf_word_vec_features)

Output

array([[-0.00728862, -0.01345045,  0.02334223, -0.00258989,  0.00500905,
-0.00913428, 0.00057808, -0.01095917, -0.00025702, -0.00165257],
[-0.02009719, -0.01936696, 0.0056747 , 0.00887485, 0.02952368,
0.00819392, 0.02715274, -0.0298718 , 0.02297843, -0.0237992 ],
[-0.00721121, -0.00258696, 0.01239834, -0.01018197, 0.00795635,
-0.00085167, 0.00906817, -0.00469667, 0.00799437, -0.01167674],
[ 0.01571231, -0.02214988, 0.02293927, -0.03584988, -0.02027377,
0.00031135, 0.00284845, 0.01365358, 0.0084586 , -0.0247597 ]])
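
Note that the last row matches the plain-average result earlier: with min_count=2, only 'blue' from 'i love blue cheese' made it into the Word2Vec vocabulary, so for that document both the unweighted and the TF-IDF-weighted average reduce to the single 'blue' vector. A quick check:

print(np.allclose(avg_word_vec_features[3], wt_tfidf_word_vec_features[3]))  # expect True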

nd_wt_tfidf_word_vec_features=tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_new_doc,tfidf_vectors=nd_tfidf,tfidf_vocabulary=vocab,model=model,num_features=10)
print(nd_wt_tfidf_word_vec_features)

Output

array([[-0.01223734, -0.02956665,  0.02708268, -0.01397412,  0.01101045,
-0.00361711, 0.02421493, -0.01619775, 0.01438254, -0.00899163]])
Author: CinKate
Link: http://renxingkai.github.io/2019/04/05/word-tfidf/
Copyright Notice: All articles in this blog are licensed under CC BY-NC-SA 4.0 unless stating additionally.