import nltk.tokenize as tk

doc = "Are you curious about tokenization? " \
      "Let's see how it works! " \
      "We need to analyze a couple of sentences " \
      "with punctuations to see it in action."
print(doc)

# Split the text into sentences.
tokens = tk.sent_tokenize(doc)
for i, token in enumerate(tokens):
    print('%2d' % (i + 1), token)
print('-' * 15)

# Split the text into words (Treebank-style tokenizer).
tokens = tk.word_tokenize(doc)
for i, token in enumerate(tokens):
    print('%2d' % (i + 1), token)
print('-' * 15)

# Split the text into words and punctuation marks.
tokenizer = tk.WordPunctTokenizer()
tokens = tokenizer.tokenize(doc)
for i, token in enumerate(tokens):
    print('%2d' % (i + 1), token)
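The two word-level tokenizers differ mainly in how they treat contractions and punctuation: word_tokenize keeps "Let's" together as "Let" plus "'s", while WordPunctTokenizer splits on every punctuation boundary. A minimal sketch of the difference (the exact token lists depend on the NLTK version, and the punkt resource must be downloaded once before the tokenizers can run):

import nltk
import nltk.tokenize as tk

# The sentence/word tokenizers need the 'punkt' models; download once.
nltk.download('punkt')

text = "Let's see."
print(tk.word_tokenize(text))                  # typically ['Let', "'s", 'see', '.']
print(tk.WordPunctTokenizer().tokenize(text))  # typically ['Let', "'", 's', 'see', '.']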
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = ['table', 'probably', 'wolves', 'playing', 'is',
         'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision']
pt_stemmer = pt.PorterStemmer()
lc_stemmer = lc.LancasterStemmer()
sb_stemmer = sb.SnowballStemmer('english')
for word in words:
    pt_stem = pt_stemmer.stem(word)
    lc_stem = lc_stemmer.stem(word)
    sb_stem = sb_stemmer.stem(word)
    print("%8s %8s %8s %8s" % (word, pt_stem, lc_stem, sb_stem))
Lemmatization
Nouns are reduced to their singular form; verbs are reduced to their base (infinitive) form.
import nltk.stem as ns

words = ['table', 'probably', 'wolves', 'playing', 'is',
         'dog', 'the', 'beaches', 'grounded', 'dreamt', 'envision']
lemmatizer = ns.WordNetLemmatizer()
for word in words:
    n_lema = lemmatizer.lemmatize(word, pos='n')
    v_lema = lemmatizer.lemmatize(word, pos='v')
    print("%8s %8s %8s" % (word, n_lema, v_lema))
Bag-of-Words Model
Vocabulary: the distinct words that appear in the paragraph; each sentence is then represented by how many times each vocabulary word occurs in it.

[1] The brown dog is running.
[2] The black dog is in the black room.
[3] Running in the room is forbidden.

Vocabulary in order of first appearance: the, brown, dog, is, running, black, in, room, forbidden

      black  brown  dog  forbidden  in  is  room  running  the
[1]     0      1     1       0       0   1    0      1      1
[2]     2      0     1       0       1   1    1      0      2
[3]     0      0     0       1       1   1    1      1      1
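The table above can be reproduced by hand with plain Python, which makes the idea concrete before handing the work to CountVectorizer; the sentence list and variable names below are illustrative:

from collections import Counter

sentences = ['The brown dog is running.',
             'The black dog is in the black room.',
             'Running in the room is forbidden.']

# Strip the trailing period, lowercase, and split each sentence into words.
docs = [s.rstrip('.').lower().split() for s in sentences]
# Alphabetically sorted vocabulary, as CountVectorizer would produce it.
vocab = sorted(set(word for d in docs for word in d))
print(vocab)
for d in docs:
    counts = Counter(d)
    print([counts[word] for word in vocab])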
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft

doc = 'The brown dog is running. ' \
      'The black dog is in the black room. ' \
      'Running in the room is forbidden.'
print(doc)
sentences = tk.sent_tokenize(doc)
print(sentences)
# Count vectorizer: one row per sentence, one column per vocabulary word.
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
words = cv.get_feature_names()
print(words)
print(bow)
Term Frequency (TF)
Normalize the bag-of-words matrix so that each vocabulary word is represented by its frequency of occurrence within each sample; this frequency expresses the word's value to the meaning of that particular sentence.
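Concretely, each row of the bag-of-words matrix is divided by that row's sum, so the values in a row add up to 1. A minimal NumPy sketch of the same operation that sp.normalize(bow, norm='l1') performs in the code below, using the matrix from the table above:

import numpy as np

bow = np.array([[0, 1, 1, 0, 0, 1, 0, 1, 1],
                [2, 0, 1, 0, 1, 1, 1, 0, 2],
                [0, 0, 0, 1, 1, 1, 1, 1, 1]])
# L1 normalization: divide every row by the sum of its absolute values.
tf = bow / np.abs(bow).sum(axis=1, keepdims=True)
print(tf)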
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft
import sklearn.preprocessing as sp

doc = 'The brown dog is running. ' \
      'The black dog is in the black room. ' \
      'Running in the room is forbidden.'
print(doc)
sentences = tk.sent_tokenize(doc)
print(sentences)
# Count vectorizer
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
words = cv.get_feature_names()
print(words)
print(bow)
# L1-normalize each row to turn raw counts into term frequencies.
tf = sp.normalize(bow, norm='l1')
print(tf)
$$\text{IDF} = \frac{\text{total number of samples}}{\text{number of samples containing the word}}$$
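For example, with the three sentences from the bag-of-words section, "dog" appears in 2 of the 3 samples, so by the formula above:

$$\text{IDF}_{dog} = \frac{3}{2} = 1.5$$

(scikit-learn's TfidfTransformer applies a smoothed, logarithmic variant of this ratio by default, so the values it prints will not match this simple quotient exactly.)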
Term Frequency-Inverse Document Frequency (TF-IDF): a mathematical model for representing natural language text.
import nltk.tokenize as tk
import sklearn.feature_extraction.text as ft

doc = 'The brown dog is running. ' \
      'The black dog is in the black room. ' \
      'Running in the room is forbidden.'
print(doc)
sentences = tk.sent_tokenize(doc)
print(sentences)
# Count vectorizer
cv = ft.CountVectorizer()
bow = cv.fit_transform(sentences).toarray()
words = cv.get_feature_names()
print(words)
print(bow)
# TF-IDF transformer: reweight the counts by inverse document frequency.
tt = ft.TfidfTransformer()
tfidf = tt.fit_transform(bow).toarray()
print(tfidf)
import sklearn.datasets as sd
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb

# Load the newsgroup training texts from disk, one subdirectory per category.
train = sd.load_files('../data/20news', encoding='latin1',
                      shuffle=True, random_state=7)
train_data = train.data
train_y = train.target
categories = train.target_names
# Bag-of-words counts, then TF-IDF weighting.
cv = ft.CountVectorizer()
train_bow = cv.fit_transform(train_data)
tt = ft.TfidfTransformer()
train_x = tt.fit_transform(train_bow)
# Naive Bayes classifier based on the multinomial distribution.
model = nb.MultinomialNB()
model.fit(train_x, train_y)
test_data = [
    'The curveballs of right handed pitchers tend to curve to the left',
    'Caesar cipher is an ancient form of encryption',
    'This two-wheeler is realy good on slippery roads']
test_bow = cv.transform(test_data)
test_x = tt.transform(test_bow)
pred_test_y = model.predict(test_x)
for sentence, index in zip(test_data, pred_test_y):
    print(sentence, '->', categories[index])
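The CountVectorizer + TfidfTransformer + MultinomialNB chain can also be wrapped in a single scikit-learn Pipeline, which keeps the feature transforms and the classifier together so raw text can be passed straight to fit and predict. A sketch under the same data layout as above (the name pipe and the step names are illustrative):

import sklearn.pipeline as pl
import sklearn.feature_extraction.text as ft
import sklearn.naive_bayes as nb

# Equivalent chain: word counts -> TF-IDF weighting -> multinomial naive Bayes.
pipe = pl.Pipeline([
    ('count', ft.CountVectorizer()),
    ('tfidf', ft.TfidfTransformer()),
    ('clf', nb.MultinomialNB())])
pipe.fit(train_data, train_y)          # raw training texts go straight in
pred_test_y = pipe.predict(test_data)  # no separate transform calls needed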
import nltk.corpus as nc
import nltk.classify as cf
import nltk.classify.util as cu

# Build boolean word-presence features for the positive movie reviews.
pdata = []
fileids = nc.movie_reviews.fileids('pos')
for fileid in fileids:
    feature = {}
    words = nc.movie_reviews.words(fileid)
    for word in words:
        feature[word] = True
    pdata.append((feature, 'POSITIVE'))
# Do the same for the negative reviews.
ndata = []
fileids = nc.movie_reviews.fileids('neg')
for fileid in fileids:
    feature = {}
    words = nc.movie_reviews.words(fileid)
    for word in words:
        feature[word] = True
    ndata.append((feature, 'NEGATIVE'))
# 80/20 split into training and test sets.
pnumb, nnumb = int(0.8 * len(pdata)), int(0.8 * len(ndata))
train_data = pdata[:pnumb] + ndata[:nnumb]
test_data = pdata[pnumb:] + ndata[nnumb:]
model = cf.NaiveBayesClassifier.train(train_data)
ac = cu.accuracy(model, test_data)
reviews = [
    'It is an amazing movie.',
    'This is a dull movie. I would never recommend it to anyone.',
    'The cinematography is pretty great in this movie.',
    'The direction was terrible and the story was all over the place.']
sents, probs = [], []
for review in reviews:
    feature = {}
    words = review.split(' ')
    for word in words:
        feature[word] = True
    pcls = model.prob_classify(feature)
    sent = pcls.max()
    prob = pcls.prob(sent)
    sents.append(sent)
    probs.append(prob)
for review, sent, prob in zip(reviews, sents, probs):
    print(review, '->', sent, prob)
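NLTK's NaiveBayesClassifier can also report which word features weigh most heavily in its decisions, which is a quick sanity check on a sentiment model; a brief, optional addition to the example above:

# Print overall accuracy and the ten features that contribute most
# strongly to the POSITIVE/NEGATIVE decision.
print('accuracy:', ac)
model.show_most_informative_features(10)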
Topic Extraction
Based on LDA (Latent Dirichlet Allocation).
import warnings
warnings.filterwarnings('ignore', category=UserWarning)
import nltk.tokenize as tk
import nltk.corpus as nc
import nltk.stem.snowball as sb
import gensim.models.ldamodel as gm
import gensim.corpora as gc

# Read the corpus: one document per line, with the trailing newline removed.
doc = []
with open('../data/topic.txt', 'r') as f:
    for line in f.readlines():
        doc.append(line[:-1])
# Regexp tokenizer that keeps only word characters, plus the English
# stopword list and a Snowball stemmer for cleaning the tokens.
tokenizer = tk.RegexpTokenizer(r'\w+')
stopwords = nc.stopwords.words('english')
stemmer = sb.SnowballStemmer('english')
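The listing stops before the topic model itself is built. A minimal sketch of how the pipeline typically continues with gensim, reusing the tokenizer, stopword list, and stemmer created above (the number of topics, pass count, and words per topic are illustrative choices, not taken from the original):

lines_tokens = []
for line in doc:
    # Tokenize and lowercase, drop stopwords, then stem the remaining words.
    tokens = tokenizer.tokenize(line.lower())
    lines_tokens.append([stemmer.stem(t) for t in tokens if t not in stopwords])

# Map words to integer ids and convert each line into a bag-of-words vector.
dic = gc.Dictionary(lines_tokens)
bow = [dic.doc2bow(tokens) for tokens in lines_tokens]

# Train the LDA model and print the discovered topics.
model = gm.LdaModel(bow, num_topics=2, id2word=dic, passes=25)
for topic in model.print_topics(num_topics=2, num_words=4):
    print(topic)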