Happy Qingming Festival! And yet here I am, still in the lab writing code, haha.
This post records a few Word2Vec-related techniques~
The Word2Vec model
Train word vectors directly with the open-source gensim library:
import gensim
Tokenize the corpus
# tokenize corpus
Output:
[['the', 'sky', 'is', 'blue'], ...]
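The tokenization code above is truncated in the original. For reference, a corpus and tokenization that would reproduce the first row of the output might look like the sketch below; only the first sentence is confirmed by the output, the other three are placeholders (the averaged-vector output further down has four rows, so the corpus holds four documents).

# Hypothetical corpus: only the first sentence is confirmed by the output above
CORPUS = [
    'the sky is blue',
    'sky is blue and sky is beautiful',
    'the beautiful sky is so blue',
    'i love blue cheese',
]

# tokenize corpus: split each sentence into a list of tokens
TOKENIZED_CORPUS = [sentence.split() for sentence in CORPUS]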
Build the word vectors
model = gensim.models.Word2Vec(TOKENIZED_CORPUS, size=10, window=10, min_count=2, sample=1e-3)
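Here size is the word vector dimensionality, window the context window width, min_count the minimum word frequency, and sample the downsampling threshold for frequent words. Note that gensim 4.0 renamed size to vector_size (and iter to epochs), so under gensim >= 4.0 the same call reads:

# Equivalent call under gensim >= 4.0 (size was renamed to vector_size)
model = gensim.models.Word2Vec(TOKENIZED_CORPUS, vector_size=10, window=10, min_count=2, sample=1e-3)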
Representing a document by its average word vector
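The post never shows the body of averaged_word_vectorizer, so the following is a minimal sketch of what it presumably does, with names matching the call below (gensim >= 4.0 API assumed): each document is mapped to the mean of the vectors of its in-vocabulary words.

import numpy as np

def average_word_vectors(words, model, vocabulary, num_features):
    # Sum the vectors of the document's in-vocabulary words, then divide by the count
    feature_vector = np.zeros((num_features,), dtype='float64')
    nwords = 0
    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector = np.add(feature_vector, model.wv[word])
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
    return feature_vector

def averaged_word_vectorizer(corpus, model, num_features):
    # use model.wv.index2word instead under gensim < 4.0
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(doc, model, vocabulary, num_features)
                for doc in corpus]
    return np.array(features)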
# num_features is the word vector dimensionality (10, matching size above)
avg_word_vec_features = averaged_word_vectorizer(TOKENIZED_CORPUS, model=model, num_features=10)
Output:
array([[-0.00710545, -0.01549264, 0.02188712, -0.00322829, 0.00586532,
-0.00687592, 0.00339291, -0.01177494, 0.00265543, -0.00539964],
[-0.0157312 , -0.01630003, 0.00551589, 0.00166568, 0.02385859,
0.0085727 , 0.02538068, -0.02266891, 0.02231819, -0.02521743],
[-0.0070758 , -0.00578274, 0.01280785, -0.00960104, 0.00821758,
-0.00023592, 0.01009926, -0.00624976, 0.00913788, -0.01323305],
[ 0.01571231, -0.02214988, 0.02293927, -0.03584988, -0.02027377,
0.00031135, 0.00284845, 0.01365358, 0.00845861, -0.0247597 ]])
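The same vectorizer can score a new, unseen document. tokenized_new_doc is not defined anywhere in the post; a plausible, purely hypothetical definition:

# Hypothetical new document: the actual sentence is not shown in the post
new_doc = ['loving this blue sky']
tokenized_new_doc = [doc.split() for doc in new_doc]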
nd_avg_word_vec_features = averaged_word_vectorizer(corpus=tokenized_new_doc, model=model, num_features=10)
Output:
array([[-0.00968785, -0.02889012, 0.02670473, -0.01596956, 0.00815679,
-0.00325876, 0.02226594, -0.01347479, 0.01384218, -0.01042995]])
TF-IDF weighted average word vectors
If the plain average does not work well, or feels too simplistic, you can instead weight each word by its TF-IDF score: the document vector becomes the TF-IDF weighted average, sum_w tfidf(w) * v(w) / sum_w tfidf(w), taken over the document's in-vocabulary words w.
def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
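Only the signature survives in the post; the body below is a sketch reconstructed from that signature and from the calls further down (tfidf_vector is one row of the TF-IDF matrix, tfidf_vocabulary maps a word to its column index; gensim >= 4.0 API assumed).

import numpy as np

def tfidf_wtd_avg_word_vectors(words, tfidf_vector, tfidf_vocabulary, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    feature_vector = np.zeros((num_features,), dtype='float64')
    wts = 0.0
    for word in words:
        if word in vocabulary and word in tfidf_vocabulary:
            # weight each word vector by the word's TF-IDF score in this document
            weight = tfidf_vector[0, tfidf_vocabulary[word]]
            feature_vector += weight * model.wv[word]
            wts += weight
    if wts:
        # normalize by the total weight instead of the word count
        feature_vector /= wts
    return feature_vector

def tfidf_weighted_averaged_word_vectorizer(corpus, tfidf_vectors, tfidf_vocabulary, model, num_features):
    # iterate documents together with their TF-IDF rows
    features = [tfidf_wtd_avg_word_vectors(doc, doc_tfidf, tfidf_vocabulary, model, num_features)
                for doc, doc_tfidf in zip(corpus, tfidf_vectors)]
    return np.array(features)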
TF-IDF preprocessing

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix

def tfidf_extractor(corpus, ngram_range=(1, 1)):
    vectorizer = TfidfVectorizer(min_df=1,
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

def bow_extractor(corpus, ngram_range=(1, 1)):
    # min_df=1 keeps terms that appear in as few as one document
    # setting ngram_range=(1, 3) would build a vector space over all unigrams, bigrams and trigrams
    vectorizer = CountVectorizer(min_df=1, ngram_range=ngram_range)
    features = vectorizer.fit_transform(corpus)
    return vectorizer, features

def display_features(features, feature_names):
    df = pd.DataFrame(data=features,
                      columns=feature_names)
    print(df)
bow_vectorizer, bow_features = bow_extractor(CORPUS)
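This line is truncated in the original. The next steps reference tfidf_features, tfidf_vectorizer and, for the new document, nd_tfidf; presumably they are produced roughly like this (a sketch, names inferred from the calls below):

# Presumed continuation: fit TF-IDF on the raw corpus, then transform the new document
tfidf_vectorizer, tfidf_features = tfidf_extractor(CORPUS)
nd_tfidf = tfidf_vectorizer.transform(new_doc)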
TF-IDF weighted word vectors

corpus_tfidf = tfidf_features
vocab = tfidf_vectorizer.vocabulary_
wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=TOKENIZED_CORPUS, tfidf_vectors=corpus_tfidf, tfidf_vocabulary=vocab, model=model, num_features=10)
Output:
array([[-0.00728862, -0.01345045, 0.02334223, -0.00258989, 0.00500905,
-0.00913428, 0.00057808, -0.01095917, -0.00025702, -0.00165257],
[-0.02009719, -0.01936696, 0.0056747 , 0.00887485, 0.02952368,
0.00819392, 0.02715274, -0.0298718 , 0.02297843, -0.0237992 ],
[-0.00721121, -0.00258696, 0.01239834, -0.01018197, 0.00795635,
-0.00085167, 0.00906817, -0.00469667, 0.00799437, -0.01167674],
[ 0.01571231, -0.02214988, 0.02293927, -0.03584988, -0.02027377,
0.00031135, 0.00284845, 0.01365358, 0.0084586 , -0.0247597 ]])
nd_wt_tfidf_word_vec_features = tfidf_weighted_averaged_word_vectorizer(corpus=tokenized_new_doc, tfidf_vectors=nd_tfidf, tfidf_vocabulary=vocab, model=model, num_features=10)
Output:
array([[-0.01223734, -0.02956665, 0.02708268, -0.01397412, 0.01101045, ...]])