# This notebook uses the scikit-learn and pandas libraries to try to differentiate between the albums of two of my favorite artists: Drake and Kanye West.
import math, re
import glob, os
import pandas as pd
from collections import Counter
def tokenize(s):
    """
    Split a string into tokens on runs of whitespace.

    Input:
        string s
    Output:
        list of strings (empty when s contains no non-whitespace characters)
    """
    tokens = s.split()
    return tokens
def preprocess(s, lowercase=True, strip_punctuation=True):
    """
    Turn raw text (or an already tokenized list) into normalized tokens.

    Input:
        string s (a non-string value is used as the token list as-is)
        boolean lowercase: lower-case every token
        boolean strip_punctuation: trim punctuation from both ends of every token
    Return:
        list of strings
    """
    punctuation = '.,?<>:;"\'!%'
    tokens = tokenize(s) if isinstance(s, str) else s
    if lowercase:
        tokens = [tok.lower() for tok in tokens]
    if strip_punctuation:
        # str.strip only trims the ends, so punctuation inside a token
        # (e.g. the apostrophe in "don't") is kept.
        tokens = [tok.strip(punctuation) for tok in tokens]
    return tokens
def token_frequency(tokens=None, tf=None, relative=False):
    """
    Count token occurrences, optionally continuing from existing counts.

    Input:
        tokens = list of strings or None (None is treated as "no tokens")
        tf = dict of running counts or None; a dict that is passed in is
             updated in place, so counts can be accumulated across calls
        relative = boolean; when True, every count is divided by the total
             number of counted tokens
    Return:
        dictionary of token frequencies (the same dict object as tf when one
        was passed and relative is False, otherwise a new dict)
    """
    # Create the dict per call: a `tf={}` default is built once and shared by
    # every call, so counts would leak from one call into the next.
    if tf is None:
        tf = {}
    if tokens is None:
        tokens = ()
    for t in tokens:
        tf[t] = tf.get(t, 0) + 1
    if relative:
        total = sum(tf.values())
        tf = {t: c / total for t, c in tf.items()}
    return tf
# Lyrics corpus: every *.txt file matched by this pattern is one album.
path = '/Users/nurzhan.kanatzhanov/Desktop/SP2020/Web Portfolio/portfolio/txt/*.txt'
filenames = glob.glob(path)

# Set TOP_N to 20 to learn a model on the 20 most frequent words across all
# albums, using them as features (columns) in a pandas DataFrame.
TOP_N = 20

# Read and tokenize every album exactly once; the tokens are reused below for
# the per-album vectors. `with` closes each file handle deterministically.
tf = {}
album_tokens = []
for fn in filenames:
    with open(fn, 'r') as fh:
        s = fh.read()
    tokens = preprocess(s)
    album_tokens.append(tokens)
    tf = token_frequency(tokens, tf=tf)

# The TOP_N most frequent tokens of the whole corpus become the features.
top_f = sorted(tf.items(), key=lambda x: x[1], reverse=True)[:TOP_N]
features = [t[0] for t in top_f]

# File name without the 4-character extension, underscores -> spaces, title-cased.
labels = [os.path.split(fn)[1][:-4].replace('_', ' ').title() for fn in filenames]

# Relative token frequencies per album, restricted to the selected features.
# A fresh dict is passed explicitly so albums never share counts.
vectors = [token_frequency(tokens, tf={}, relative=True) for tokens in album_tokens]
vectors = [{key: v[key] for key in features if key in v} for v in vectors]

# Features missing from an album are filled with 0.
vectors_df = pd.DataFrame(vectors, index=labels, columns=features).fillna(0)
# Remove truncation and adjust column width so the whole table is displayed.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "no limit"; the legacy value -1 has been deprecated since
# pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)
vectors_df
# Use scikit-learn's KMeans to learn 2 clusters from the data.
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(vectors_df)

# Reduce the feature vectors to two principal components for plotting.
pca = PCA(n_components=2)
transformed = pca.fit_transform(vectors_df)
x, y = transformed[:, 0], transformed[:, 1]
import matplotlib.pyplot as plt
from adjustText import adjust_text

# One colour per KMeans cluster id.
col_dict = {0: 'green', 1: 'blue'}
cols = [col_dict[l] for l in kmeans.labels_]

plt.figure(figsize=(16, 12))
plt.scatter(x, y, c=cols, s=100, alpha=.65)

# Annotate every point with its album label.
texts = [
    plt.text(px, py, name, weight='bold')
    for px, py, name in zip(x, y, labels)
]

# One arrow per feature, pointing along that feature's weights on the two
# principal components (scaled down by 30), with the feature name at its tip.
arrows = []
for i, c in enumerate(pca.components_.transpose()):
    tip_x, tip_y = c[0] / 30, c[1] / 30
    plt.arrow(0, 0, tip_x, tip_y, alpha=.2, width=.0001, color="red")
    arrows.append(plt.text(tip_x, tip_y, features[i]))

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title("Drake's and Kanye West's albums in a space of {} most common features".format(TOP_N))

# Nudge the labels apart so they do not overlap.
adjust_text(texts)
adjust_text(arrows)
plt.show()
# The following section uses gensim, an unsupervised topic modeling and
# language processing library.
def get_texts(filenames, stop_words):
    """
    Lazily yield the preprocessed tokens of each file, minus stop words.

    Input:
        filenames = iterable of paths to text files
        stop_words = collection of tokens (strings) to drop
    Yield:
        list of strings, one list per file, in the order of filenames
    """
    # Build the lookup once: set membership is constant-time, whereas a list
    # would be scanned in full for every single token.
    stop_words = frozenset(stop_words)
    for fn in filenames:
        # `with` closes the handle as soon as the file has been read.
        with open(fn, 'r') as fh:
            text = fh.read()
        yield [t for t in preprocess(text) if t not in stop_words]
NUM_TOPICS = 5   # number of LDA topics to learn
TOPN = 15        # number of top words requested per topic
STOP = 100       # the STOP most frequent tokens are treated as stop words

# Corpus-wide token counts, accumulated file by file. `with` makes sure each
# file handle is closed instead of being left to the garbage collector.
freqs = {}
for file in filenames:
    with open(file, 'r') as fh:
        freqs = token_frequency(preprocess(fh.read()), tf=freqs)
stop_words = sorted(freqs, key=freqs.__getitem__, reverse=True)[:STOP]

from gensim import corpora, models, similarities

# get_texts is a generator, so it is called once per pass over the files:
# once to build the vocabulary and once to build the bag-of-words corpus.
dictionary = corpora.Dictionary(get_texts(filenames, stop_words))
corpus = [dictionary.doc2bow(text) for text in get_texts(filenames, stop_words)]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)
# Topic distribution of every document in the corpus.
corpus_lda = lda[corpus]
# Print the TOPN highest-weighted words of every topic.
for topic in range(NUM_TOPICS):
    tt = lda.get_topic_terms(topic, topn=TOPN)
    top_words = ', '.join(dictionary[word_id] for word_id, _ in tt)
    print('Topic {:>2d}: {}'.format(topic, top_words))

# Print each album's topic mixture, strongest topic first.
for i, label in enumerate(labels):
    ranked = sorted(corpus_lda[i], key=lambda pair: pair[1], reverse=True)
    topics = ', '.join(
        'Topic {} ({:2.2f}%)'.format(topic_id, share * 100)
        for topic_id, share in ranked
    )
    print('{}:\n{}\n'.format(label, topics))
# Use gensim's similarities class, which "computes similarities across a
# collection of documents in the Vector Space Model." This will connect the
# most similar albums between Drake and Kanye West.
similarity_index = similarities.SparseMatrixSimilarity(corpus_lda, num_features=NUM_TOPICS)
print('Most similar texts:\n')
for i, label in enumerate(labels):
    sim = similarity_index[corpus_lda[i]]
    # Exclude the album itself by position. Skipping the first sorted entry
    # is not reliable: with ties or near-identical topic mixtures another
    # album can sort ahead of it, and the album would then be listed as
    # similar to itself.
    sim_labels = sorted(
        ((s, l) for j, (s, l) in enumerate(zip(sim, labels)) if j != i),
        reverse=True,
    )
    # Keep the three most similar other albums.
    sim_print = ', '.join(l for s, l in sim_labels[:3])
    print('{}: {}\n'.format(label, sim_print))
import matplotlib.pyplot as plt

plt.figure(figsize=(15, 10))
# Dump the raw term weights of topic 0.
print(lda.get_topics()[0])

# Bar chart of the 25 highest-weighted terms of topic 1.
top_terms = lda.get_topic_terms(1, topn=25)
tokens, y = zip(*top_terms)
tokens = [dictionary[t] for t in tokens]  # term ids -> words
x = list(range(25))
plt.bar(x, y, tick_label=tokens)
plt.xticks(rotation='vertical')
plt.xlabel('word')
plt.ylabel('frequency')
# Build a pandas DataFrame with the distribution of topics of the 17 albums.
topics = list(range(NUM_TOPICS))
# Each document is a sequence of (topic id, ratio) pairs.
vectors = [dict(v) for v in corpus_lda]
# Topics absent from a document are filled with 0.
vectors_df = pd.DataFrame(vectors, index=labels, columns=topics).fillna(0)

n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(vectors_df)
kmeans.labels_

# Reduce the topic vectors to two principal components for plotting.
pca = PCA(n_components=2)
transformed = pca.fit_transform(vectors_df)
x, y = transformed[:, 0], transformed[:, 1]
# One colour per KMeans cluster id.
col_dict = {0: 'green', 1: 'blue'}
cols = [col_dict[l] for l in kmeans.labels_]

plt.figure(figsize=(16, 12))
plt.scatter(x, y, c=cols, s=100, alpha=.65)

# Annotate every point with its album label.
texts = [
    plt.text(px, py, name, weight='bold')
    for px, py, name in zip(x, y, labels)
]

# One arrow per topic, pointing along that topic's weights on the two
# principal components (halved), with the topic number at its tip.
arrows = []
for i, c in enumerate(pca.components_.transpose()):
    tip_x, tip_y = c[0] / 2, c[1] / 2
    plt.arrow(0, 0, tip_x, tip_y, alpha=.3, width=.002, color="red")
    arrows.append(plt.text(tip_x, tip_y, topics[i]))

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title("Drake's and Kanye West's albums in a space of {} topics".format(NUM_TOPICS))

# Nudge the labels apart so they do not overlap.
adjust_text(texts)
adjust_text(arrows)
plt.show()
# Add the AC/DC albums to the corpus; the Drake / Kanye West files follow them.
path_acdc = '/Users/nurzhan.kanatzhanov/Desktop/SP2020/Web Portfolio/portfolio/txt/acdc/*.txt'
filenames_acdc = glob.glob(path_acdc)
filenames_acdc.extend(filenames)

NUM_TOPICS = 10

# Recompute the corpus-wide token counts for the enlarged corpus. `with`
# makes sure each file handle is closed after it has been read.
freqs = {}
for file in filenames_acdc:
    with open(file, 'r') as fh:
        freqs = token_frequency(preprocess(fh.read()), tf=freqs)
stop_words = sorted(freqs, key=freqs.__getitem__, reverse=True)[:STOP]

from gensim import corpora, models, similarities

# get_texts is a generator, so it is called once per pass over the files.
dictionary = corpora.Dictionary(get_texts(filenames_acdc, stop_words))
corpus = [dictionary.doc2bow(text) for text in get_texts(filenames_acdc, stop_words)]
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=NUM_TOPICS)
# Topic distribution of every document in the enlarged corpus.
corpus_lda = lda[corpus]
topics = list(range(NUM_TOPICS))
# Each document is a sequence of (topic id, ratio) pairs.
vectors = [dict(v) for v in corpus_lda]
# File name without the 4-character extension, underscores -> spaces, title-cased.
labels = [
    os.path.split(fn)[1][:-4].replace('_', ' ').title()
    for fn in filenames_acdc
]
# Topics absent from a document are filled with 0.
vectors_df = pd.DataFrame(vectors, index=labels, columns=topics).fillna(0)

n_clusters = 3
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(vectors_df)

# Reduce the topic vectors to two principal components for plotting.
pca = PCA(n_components=2)
transformed = pca.fit_transform(vectors_df)
x, y = transformed[:, 0], transformed[:, 1]
# One colour per KMeans cluster id.
col_dict = {0: 'green', 1: 'blue', 2: 'yellow'}
cols = [col_dict[l] for l in kmeans.labels_]

plt.figure(figsize=(16, 12))
plt.scatter(x, y, c=cols, s=100, alpha=.65)

# Annotate every point with its album label.
texts = [
    plt.text(px, py, name, weight='bold')
    for px, py, name in zip(x, y, labels)
]

# One arrow per topic, pointing along that topic's weights on the two
# principal components (halved), with the topic number at its tip.
arrows = []
for i, c in enumerate(pca.components_.transpose()):
    tip_x, tip_y = c[0] / 2, c[1] / 2
    plt.arrow(0, 0, tip_x, tip_y, alpha=.3, width=.002, color="red")
    arrows.append(plt.text(tip_x, tip_y, topics[i]))

plt.xlabel('PCA1')
plt.ylabel('PCA2')
plt.title("Drake's, Kanye West's, and AC/DC's albums in a space of {} topics".format(NUM_TOPICS))

# Nudge the labels apart so they do not overlap.
adjust_text(texts)
adjust_text(arrows)
plt.show()