"""Train and evaluate a Multinomial Naive Bayes classifier on text data.

The script reads ``spam_ham_dataset.csv``, uses its ``text`` column as the
input and its ``label`` column as the target, holds out 20% of the rows for
testing, converts the texts to TF-IDF weighted bag-of-words features, fits a
``MultinomialNB`` model on the training split and reports its accuracy on the
held-out split.
"""

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

data = pd.read_csv('spam_ham_dataset.csv')

# Features stay a one-column DataFrame; the target is taken as a 1-D Series
# because scikit-learn estimators expect a 1-D ``y`` and warn when handed a
# single-column DataFrame (column vector).
x = data[['text']]
y = data['label']

# NOTE: no ``random_state`` is passed, so the split (and therefore the
# reported accuracy) differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

print('train data number: {d}'.format(d=len(x_train)))
print('test data number: {d}'.format(d=len(x_test)))

# Learn the vocabulary from the training texts only. Fitting on the full
# dataset would let information from the held-out rows shape the features.
cv = CountVectorizer()
count = cv.fit_transform(x_train['text'])

# IDF weights are likewise learned from the training counts only.
tfidf = TfidfTransformer()
tfidf_matrix = tfidf.fit_transform(count)

model = MultinomialNB()
model.fit(tfidf_matrix, y_train)

# Vectorize the held-out texts once with the already-fitted transformers and
# reuse the result for both prediction and scoring.
test_matrix = tfidf.transform(cv.transform(x_test['text']))
predictions = model.predict(test_matrix)
accuracy = model.score(test_matrix, y_test)
print('test accuracy: {a}'.format(a=accuracy))