import nltk
from nltk.corpus import gutenberg
from pprint import pprint

# Load the corpus
alice = gutenberg.raw(fileids='carroll-alice.txt')
sample_text = ('We will discuss briefly about the basic syntax, '
               'structure and design philosophies. '
               'There is a defined hierarchical syntax for Python code '
               'which you should remember when writing code! '
               'Python is a really powerful programming language!')

# Total characters in Alice in Wonderland
print(len(alice))
# First 100 characters in the corpus
print(alice[0:100])
Output:

144395
[Alice's Adventures in Wonderland by Lewis Carroll 1865]

CHAPTER I. Down the Rabbit-Hole

Alice was
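Alice in Wonderland is only one of the Project Gutenberg texts bundled with NLTK. As a quick check (assuming the gutenberg corpus has already been fetched with nltk.download('gutenberg')), you can list everything that is available:

# List all Gutenberg fileids shipped with NLTK's data package
print(gutenberg.fileids())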
1.1.1 Default sentence tokenizer – nltk.sent_tokenize
# Default sentence tokenizer
default_st = nltk.sent_tokenize
alice_sentences = default_st(text=alice)
sample_sentences = default_st(text=sample_text)

print('Total sentences in sample_text:', len(sample_sentences))
print('Sample text sentences :-')
pprint(sample_sentences)
print('\nTotal sentences in alice:', len(alice_sentences))
print('First 5 sentences in alice:-')
pprint(alice_sentences[0:5])
Output:

Total sentences in sample_text: 3
Sample text sentences :-
['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember when writing code!',
 'Python is a really powerful programming language!']

Total sentences in alice: 1625
First 5 sentences in alice:-
["[Alice's Adventures in Wonderland by Lewis Carroll 1865]\n\nCHAPTER I.",
 'Down the Rabbit-Hole\n'
 '\n'
 'Alice was beginning to get very tired of sitting by her sister on the\n'
 'bank, and of having nothing to do: once or twice she had peeped into the\n'
 'book her sister was reading, but it had no pictures or conversations in\n'
 "it, 'and what is the use of a book,' thought Alice 'without pictures or\n"
 "conversation?'",
 'So she was considering in her own mind (as well as she could, for the\n'
 'hot day made her feel very sleepy and stupid), whether the pleasure\n'
 'of making a daisy-chain would be worth the trouble of getting up and\n'
 'picking the daisies, when suddenly a White Rabbit with pink eyes ran\n'
 'close by her.',
 'There was nothing so VERY remarkable in that; nor did Alice think it so\n'
 "VERY much out of the way to hear the Rabbit say to itself, 'Oh dear!",
 'Oh dear!']
# Load some raw German text; here it is assumed to come from
# NLTK's bundled Europarl corpus
from nltk.corpus import europarl_raw
german_text = europarl_raw.german.raw(fileids='ep-00-01-17.de')

# Tokenize with the default tokenizer, telling it the text is German
german_sentences_def = default_st(text=german_text, language='german')

# Load the pre-trained German model into a PunktSentenceTokenizer instance
german_tokenizer = nltk.data.load(resource_url='tokenizers/punkt/german.pickle')
german_sentences = german_tokenizer.tokenize(german_text)

# Verify the type of german_tokenizer
# should be PunktSentenceTokenizer
print(type(german_tokenizer))

# Check whether the results of both tokenizers match
# should be True
print(german_sentences_def == german_sentences)

# Print the first 5 sentences of the corpus
for sent in german_sentences[0:5]:
    print(sent)
Output:

<class 'nltk.tokenize.punkt.PunktSentenceTokenizer'>
True
Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .
Wie Sie feststellen konnten , ist der gefürchtete " Millenium-Bug " nicht eingetreten .
Doch sind Bürger einiger unserer Mitgliedstaaten Opfer von schrecklichen Naturkatastrophen geworden .
Im Parlament besteht der Wunsch nach einer Aussprache im Verlauf dieser Sitzungsperiode in den nächsten Tagen .
Heute möchte ich Sie bitten - das ist auch der Wunsch einiger Kolleginnen und Kollegen - , allen Opfern der Stürme , insbesondere in den verschiedenen Ländern der Europäischen Union , in einer Schweigeminute zu gedenken .
1.1.2 Using PunktSentenceTokenizer
## Using PunktSentenceTokenizer for sentence tokenization
punkt_st = nltk.tokenize.PunktSentenceTokenizer()
sample_sentences = punkt_st.tokenize(sample_text)
pprint(sample_sentences)
Output:

['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 'There is a defined hierarchical syntax for Python code which you should '
 'remember when writing code!',
 'Python is a really powerful programming language!']
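An un-trained PunktSentenceTokenizer relies only on the algorithm's built-in heuristics (it has learned no abbreviations), which is enough for this simple sample. The tokenizer can also be trained in an unsupervised fashion; a minimal sketch, assuming we simply reuse the raw Alice text loaded earlier as training material:

# Learn abbreviations and sentence starters from the Alice text itself
trained_st = nltk.tokenize.PunktSentenceTokenizer(train_text=alice)
alice_sentences_trained = trained_st.tokenize(alice)
print(len(alice_sentences_trained))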
1.1.3 Using RegexpTokenizer
# Sentence tokenization with regular expressions
## Using RegexpTokenizer: split on whitespace that follows
## sentence-ending punctuation, unless the period belongs to
## an abbreviation or an initial
SENTENCE_TOKENS_PATTERN = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
regex_st = nltk.tokenize.RegexpTokenizer(
    pattern=SENTENCE_TOKENS_PATTERN,
    gaps=True)
sample_sentences = regex_st.tokenize(sample_text)
pprint(sample_sentences)
Output:

['We will discuss briefly about the basic syntax, structure and design '
 'philosophies.',
 ' There is a defined hierarchical syntax for Python code which you should '
 'remember when writing code!',
 'Python is a really powerful programming language!']
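Because gaps=True makes RegexpTokenizer return the text between pattern matches, and the pattern consumes only a single whitespace character, any extra whitespace stays attached to the following sentence; that is why the second sentence above begins with a leading space. A simple cleanup:

# Strip stray whitespace left over from gap-based splitting
sample_sentences = [sent.strip() for sent in regex_st.tokenize(sample_text)]
pprint(sample_sentences)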
1.2 Word tokenization
1.2.1 Default word tokenizer – nltk.word_tokenize
## Word tokenization
sentence = "The brown fox wasn't that quick and he couldn't win the race"

# Default word tokenizer
default_wt = nltk.word_tokenize
words = default_wt(sentence)
print(words)
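nltk.word_tokenize is backed by the Treebank tokenizer, which splits contractions such as "wasn't" into "was" and "n't". NLTK ships several other word tokenizers with different trade-offs; a brief sketch on the same sentence (class names as exposed in recent NLTK releases):

# TreebankWordTokenizer: Penn Treebank conventions, splits contractions
treebank_wt = nltk.tokenize.TreebankWordTokenizer()
print(treebank_wt.tokenize(sentence))

# RegexpTokenizer: keep only alphanumeric word tokens, dropping punctuation
regex_wt = nltk.tokenize.RegexpTokenizer(pattern=r'\w+', gaps=False)
print(regex_wt.tokenize(sentence))

# WhitespaceTokenizer: split on whitespace only; punctuation stays attached
whitespace_wt = nltk.tokenize.WhitespaceTokenizer()
print(whitespace_wt.tokenize(sentence))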
import nltk
import re
import string
from pprint import pprint
corpus = ["The brown fox wasn't that quick and he couldn't win the race", "Hey that's a great deal! I just bought a phone for $199", "@@You'll (learn) a **lot** in the book. Python is an amazing language!@@"]
def tokenize_text(text):
    # Split text into sentences, then each sentence into word tokens
    sentences = nltk.sent_tokenize(text)
    word_tokens = [nltk.word_tokenize(sentence) for sentence in sentences]
    return word_tokens

token_list = [tokenize_text(text) for text in corpus]
print(token_list)
def remove_characters_after_tokenization(tokens):
    # Strip every punctuation character from each token
    pattern = re.compile('[{}]'.format(re.escape(string.punctuation)))
    filtered_tokens = [pattern.sub('', token) for token in tokens]
    return filtered_tokens

filtered_list_1 = [[remove_characters_after_tokenization(tokens)
                    for tokens in sentence_tokens]
                   for sentence_tokens in token_list]
pprint(filtered_list_1)
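One side effect worth noting: a token that was pure punctuation (such as a standalone "!") becomes an empty string after the substitution, so the lists above can contain '' entries. They are easy to drop:

# Remove the empty strings left behind by punctuation-only tokens
filtered_list_1 = [[list(filter(None, tokens)) for tokens in sentence_tokens]
                   for sentence_tokens in filtered_list_1]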
def remove_characters_before_tokenization(sentence, keep_apostrophes=False):
    sentence = sentence.strip()
    if keep_apostrophes:
        # Remove only a selected set of special characters
        PATTERN = r'[?|$|&|*|%|@|(|)|~]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    else:
        # Keep only letters, digits and spaces
        PATTERN = r'[^a-zA-Z0-9 ]'
        filtered_sentence = re.sub(PATTERN, r'', sentence)
    return filtered_sentence

filtered_list_2 = [remove_characters_before_tokenization(sentence)
                   for sentence in corpus]
print(filtered_list_2)

cleaned_corpus = [remove_characters_before_tokenization(sentence, keep_apostrophes=True)
                  for sentence in corpus]
print(cleaned_corpus)
Output:

['The brown fox wasnt that quick and he couldnt win the race',
 'Hey thats a great deal I just bought a phone for 199',
 'Youll learn a lot in the book Python is an amazing language']
["The brown fox wasn't that quick and he couldn't win the race",
 "Hey that's a great deal! I just bought a phone for 199",
 "You'll learn a lot in the book. Python is an amazing language!"]
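The output that follows comes from a contraction-expansion step whose code is not part of this excerpt. A minimal sketch that reproduces it, assuming a small hand-built CONTRACTION_MAP (a real map would cover far more contracted forms):

CONTRACTION_MAP = {"wasn't": "was not", "couldn't": "could not",
                   "that's": "that is", "you'll": "you will"}

def expand_contractions(sentence, contraction_mapping=CONTRACTION_MAP):
    # Match any known contraction, ignoring case
    contractions_pattern = re.compile(
        '({})'.format('|'.join(contraction_mapping.keys())),
        flags=re.IGNORECASE)

    def expand_match(contraction):
        match = contraction.group(0)
        expanded = (contraction_mapping.get(match)
                    or contraction_mapping.get(match.lower()))
        # Preserve the capitalisation of the original token
        return match[0] + expanded[1:]

    return contractions_pattern.sub(expand_match, sentence)

expanded_corpus = [expand_contractions(sentence) for sentence in cleaned_corpus]
print(expanded_corpus)

Output: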
['The brown fox was not that quick and he could not win the race',
 'Hey that is a great deal! I just bought a phone for 199',
 'You will learn a lot in the book. Python is an amazing language!']
2.6 Case conversion
# Case conversion
print(corpus[0].lower())
print(corpus[0].upper())
Output:

the brown fox wasn't that quick and he couldn't win the race
THE BROWN FOX WASN'T THAT QUICK AND HE COULDN'T WIN THE RACE
2.7 Removing stopwords
# Removing stopwords
def remove_stopwords(tokens):
    stopword_list = nltk.corpus.stopwords.words('english')
    # Keep only the tokens that are not in the stopword list
    filtered_tokens = [token for token in tokens if token not in stopword_list]
    return filtered_tokens

expanded_corpus_tokens = [tokenize_text(text) for text in expanded_corpus]
filtered_list_3 = [[remove_stopwords(tokens)
                    for tokens in sentence_tokens]
                   for sentence_tokens in expanded_corpus_tokens]
print(filtered_list_3)
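Note that NLTK's English stopword list is all lowercase and the membership test here is case-sensitive, so capitalised stopwords such as 'The' slip through unless the tokens are lowercased first:

# 'The' survives because the stopword list only contains 'the'
print(remove_stopwords(['The', 'the', 'and', 'race']))
# ['The', 'race']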