The data set we will be using for this is the famous “20 Newsgroups” data set. From the original website’s description, it is a collection of approximately 20,000 newsgroup documents, partitioned (nearly) evenly across 20 different newsgroups. The data set is built into scikit-learn, so we don’t need to download it explicitly.
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
twenty_train.target_names  # prints all the categories
print("\n".join(twenty_train.data[0].split("\n")[:3]))  # prints the first three lines of the first data file
From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
X_train_counts.shape
(11314, 130107)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape
(11314, 130107)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
This will train the NB classifier on the training data we provided.
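As a quick sanity check, we can use the trained objects to classify a couple of new documents. This is a minimal sketch: the two sentences in docs_new are made-up examples, and new text must go through the same transformations (counts, then tf-idf) as the training data, using transform rather than fit_transform:

docs_new = ['God is love', 'OpenGL on the GPU is fast']  # hypothetical example documents
X_new_counts = count_vect.transform(docs_new)            # transform only; the vocabulary is already fitted
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted_new = clf.predict(X_new_tfidf)
for doc, category in zip(docs_new, predicted_new):
    print('%r => %s' % (doc, twenty_train.target_names[category]))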
Building a pipeline: we can write less code and do all of the above by chaining the steps into a single Pipeline, as follows:
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
import numpy as np
twenty_test = fetch_20newsgroups(subset='test', shuffle=True)
predicted = text_clf.predict(twenty_test.data)
print(str(np.mean(predicted == twenty_test.target) * 100) + ' % Accuracy')
77.38980350504514 % Accuracy
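Mean accuracy is just a single number; for a per-category breakdown we can use scikit-learn's metrics module. A minimal sketch, reusing the predicted array from the pipeline above:

from sklearn import metrics
print(metrics.classification_report(twenty_test.target, predicted,
                                    target_names=twenty_test.target_names))  # precision/recall/F1 per newsgroup
print(metrics.confusion_matrix(twenty_test.target, predicted))               # which categories get confused with which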
To improve the accuracy, we can change the algorithm; here we use a linear support vector machine (SVM) trained with SGDClassifier.
from sklearn.linear_model import SGDClassifier
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)),
])
_ = text_clf_svm.fit(twenty_train.data, twenty_train.target)
predicted_svm = text_clf_svm.predict(twenty_test.data)
print(str(np.mean(predicted_svm == twenty_test.target) * 100) + ' % Accuracy')
The linear SVM typically scores a few points higher than the Naive Bayes baseline on this data; the exact figure depends on the run.
Tuning the model can increase accuracy even more. Here we grid-search over a few hyperparameters; note that this will take a long time to run.
from sklearn.model_selection import GridSearchCV
parameters = {
    'vect__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
gs_clf.best_params_  # best combination of parameters found
print(str(gs_clf.best_score_ * 100) + ' % Accuracy')
91.57684864695698 % Accuracy
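Note that best_score_ is the mean cross-validated accuracy on the training set, not the test-set accuracy. Since GridSearchCV refits the best estimator on the full training set by default (refit=True), the tuned model can be evaluated on the held-out test set directly; a minimal sketch:

predicted_gs = gs_clf.predict(twenty_test.data)  # predictions from the refit best estimator
print(str(np.mean(predicted_gs == twenty_test.target) * 100) + ' % Accuracy')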