In [None]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups # https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [None]:
#dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))
dataset = fetch_20newsgroups(random_state=1)

documents = dataset.data
targets = dataset.target
print('#samples :',len(documents))
print('#samples :',len(targets))

#samples : 11314
#samples : 11314


In [None]:
target_df = pd.DataFrame({'target': targets})
print(dataset.target_names)
target_df.head()

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


Unnamed: 0,target
0,17
1,0
2,17
3,11
4,10


In [None]:
news_df = pd.DataFrame({'document':documents})
# special character removal
news_df['clean_doc'] = news_df['document'].str.replace("[^a-zA-Z]", " ")
# short word removal
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))
# lowercase
news_df['clean_doc'] = news_df['clean_doc'].apply(lambda x: x.lower())

  This is separate from the ipykernel package so we can avoid doing imports until


In [None]:
news_df.head()

Unnamed: 0,document,clean_doc
0,"From: ab4z@Virginia.EDU (""Andi Beyer"")\nSubjec...",from virginia andi beyer subject israeli terro...
1,From: timmbake@mcl.ucsb.edu (Bake Timmons)\nSu...,from timmbake ucsb bake timmons subject amusin...
2,From: bc744@cleveland.Freenet.Edu (Mark Ira Ka...,from cleveland freenet mark kaufman subject re...
3,From: ray@ole.cdac.com (Ray Berry)\nSubject: C...,from cdac berry subject clipper business usual...
4,From: kkeller@mail.sas.upenn.edu (Keith Keller...,from kkeller mail upenn keith keller subject p...


In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
stop_words = stopwords.words('english')
tokenized_doc = news_df['clean_doc'].apply(lambda x: x.split()) # tokenization
tokenized_doc = tokenized_doc.apply(lambda x: [item for item in x if item not in stop_words])

In [None]:
print(tokenized_doc[1])


['timmbake', 'ucsb', 'bake', 'timmons', 'subject', 'amusing', 'atheists', 'agnostics', 'lines', 'james', 'hogan', 'writes', 'timmbake', 'ucsb', 'bake', 'timmons', 'writes', 'hogan', 'quips', 'summary', 'stuff', 'afraid', 'missed', 'point', 'thus', 'think', 'admit', 'atheists', 'sleeve', 'might', 'suspected', 'encourage', 'people', 'learn', 'atheism', 'little', 'atheists', 'sleeves', 'whatever', 'might', 'suspected', 'actually', 'quite', 'meager', 'want', 'send', 'address', 'learn', 'less', 'faith', 'faith', 'yeah', 'expect', 'people', 'read', 'actually', 'accept', 'hard', 'atheism', 'need', 'little', 'leap', 'faith', 'jimmy', 'logic', 'runs', 'steam', 'fine', 'people', 'shoot', 'foot', 'mock', 'idea', 'hope', 'understand', 'understand', 'thank', 'providing', 'healthy', 'sarcasm', 'would', 'dispelled', 'sympathies', 'would', 'faith', 'bake', 'real', 'glad', 'detected', 'sarcasm', 'angle', 'really', 'bummin', 'getting', 'sympathy', 'still', 'inclined', 'sympathy', 'somebody', 'faith', 'm

In [None]:
detokenized_doc = []
for i in range(len(news_df)):
    t = ' '.join(tokenized_doc[i])
    detokenized_doc.append(t)

news_df['clean_doc'] = detokenized_doc

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', max_features= 1000, # keep top 1000 words
max_df = 0.5, smooth_idf=True)

X = vectorizer.fit_transform(news_df['clean_doc'])

print('TF-IDF size :',X.shape)

TF-IDF size : (11314, 1000)


In [None]:
svd_model = TruncatedSVD(n_components=20, algorithm='randomized', n_iter=100, random_state=122) # set the # components
svd_model.fit(X)
len(svd_model.components_)

20

In [None]:
import numpy as np

In [None]:
np.shape(svd_model.components_)

(20, 1000)

In [None]:
terms = vectorizer.get_feature_names()  

def get_topics(components, feature_names, n=5):
    for idx, topic in enumerate(components):
        print("Topic %d:" % (idx+1), [(feature_names[i], topic[i].round(5)) for i in topic.argsort()[:-n - 1:-1]])
get_topics(svd_model.components_,terms)

Topic 1: [('article', 0.18071), ('university', 0.16193), ('posting', 0.15909), ('like', 0.15505), ('host', 0.15301)]
Topic 2: [('windows', 0.33927), ('thanks', 0.17862), ('card', 0.15959), ('host', 0.13829), ('nntp', 0.13518)]
Topic 3: [('team', 0.26083), ('game', 0.24599), ('nntp', 0.19181), ('host', 0.18946), ('posting', 0.18686)]
Topic 4: [('nasa', 0.33948), ('space', 0.21293), ('posting', 0.19156), ('nntp', 0.18929), ('host', 0.18735)]
Topic 5: [('cwru', 0.18295), ('posting', 0.17018), ('host', 0.16925), ('nntp', 0.16883), ('cleveland', 0.16405)]
Topic 6: [('nasa', 0.40828), ('windows', 0.28775), ('space', 0.28202), ('window', 0.12588), ('file', 0.1258)]
Topic 7: [('pitt', 0.3883), ('gordon', 0.33401), ('banks', 0.3164), ('drive', 0.26518), ('nasa', 0.21457)]
Topic 8: [('pitt', 0.40024), ('gordon', 0.32417), ('banks', 0.31587), ('pittsburgh', 0.13856), ('clipper', 0.12806)]
Topic 9: [('state', 0.30525), ('ohio', 0.29765), ('israel', 0.24294), ('cleveland', 0.19818), ('windows', 0.1



For each following case, 


*   Identify important features (5)
*   Draw AUC-ROC curve (5)
*   Interpret results (5)


Cases:
1. Use logistic regression to regress news categories on (latent) components
2. Use logistic regression to regress senders' affiliation on (latent) components
* academia vs. industry (you can identify it by email address domain name)
3. Use logistic regression to regress receivers' affiliation on (latent) components
4. Use decision tree to do the same job as cases 1,2,3 (draw decision trees) (extra 10)
5. Change the #top words to include and #components; repeat 1,2,3,4. (extra 10)

1. Use logistic regression to regress news categories on (latent) components

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import pandas as pd
import sklearn.datasets
categories = [
    'alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc'
]
train_news = sklearn.datasets.fetch_20newsgroups(subset='train',  categories=categories, random_state=42)
X_train = train_news.data
y_train = train_news.target

test_news = sklearn.datasets.fetch_20newsgroups(subset='test',  categories=categories, random_state=42)
X_test = test_news.data
y_test = test_news.target

In [None]:
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)

In [None]:
lr_clf = LogisticRegression(solver='lbfgs', max_iter=50)
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('Accuracy {0:.3f}'.format(accuracy_score(y_test, pred)))

Accuracy 0.776


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


2. Use logistic regression to regress senders' affiliation on (latent) components

In [None]:
from sklearn.datasets import fetch_20newsgroups
news_data = fetch_20newsgroups(subset='all', random_state=156)

In [None]:
print(news_data.data[11121]) ## .edu == academia
print(news_data.filenames[11121])

From: cac@owlnet.rice.edu (Christopher Andrew Campbell)
Subject: Re: Royals
Summary: never
Organization: Rice University
Distribution: na
Lines: 12

In article <spork.735077099@camelot> spork@camelot.bradley.edu (Richard Izzo) writes:
        B.S. about darkness deleted.
>	Oh, lighten up.  What depresses me is that they might actually 
>finish last, which I believe hasn't happened since their second season in 
>1970.
	nope The Royals are the only team in the majors that have not
     finished in last place.    ^^^^    Of course this doesn't include 
     the marlins and the rockies but they have a good chance at 
     finishing last also.
>rich.



/root/scikit_learn_data/20news_home/20news-bydate-train/rec.sport.baseball/104445


In [None]:
print('size of Training data {0}, size of Test data {1}'.format(len(train_news.data), len(test_news.data)))

size of Training data 11314, size of Test data 7532


In [None]:
senders_academia_target = [0 for _ in range(11314 + 7532)]
for i in range(11314 + 7532):
  email_data = news_data.data[i].split("Subject:")
  if email_data[0].find(".edu") != -1:
    senders_academia_target[i] = 1
print(senders_academia_target)

[0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 

In [None]:
X_train = news_data.data[:11314]
y_train = senders_academia_target[:11314]
X_test = news_data.data[11314:]
y_test = senders_academia_target[11314:]
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)
lr_clf = LogisticRegression(solver='lbfgs', max_iter=50)
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('senders affiliation Accuracy {0:.3f}'.format(accuracy_score(y_test, pred)))

Accuracy 0.870


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


3. Use logistic regression to regress receivers' affiliation on (latent) components

In [None]:
receiver_academia_target = [0 for _ in range(11314 + 7532)]
for i in range(11314 + 7532):
  email_data = news_data.data[i].split("Subject:")
  if email_data[1].find(".edu") != -1:
    receiver_academia_target[i] = 1
print(receiver_academia_target)

[0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 

In [None]:
X_train = news_data.data[:11314]
y_train = receiver_academia_target[:11314]
X_test = news_data.data[11314:]
y_test = receiver_academia_target[11314:]
cnt_vect = CountVectorizer()
cnt_vect.fit(X_train)
X_train_cnt_vect = cnt_vect.transform(X_train)
X_test_cnt_vect = cnt_vect.transform(X_test)
lr_clf = LogisticRegression(solver='lbfgs', max_iter=50)
lr_clf.fit(X_train_cnt_vect, y_train)
pred = lr_clf.predict(X_test_cnt_vect)
print('receiver affiliation Accuracy {0:.3f}'.format(accuracy_score(y_test, pred)))

receiver affiliation Accuracy 0.943


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
