Co-occurrence matrix

Contents

Co-occurrence matrix#

[25]:

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

[26]:

# Toy corpus: eight short sentences.
strings = [
    "I like deep learning",
    "I like NLP",
    "I enjoy flying",
    "they enjoy flying",
    "I love eating cake",
    "I enjoy good cake",
    "I like coding",
    "they like coding",
]

# Tokenise every sentence into runs of ASCII letters; anything else
# (spaces, digits, punctuation) acts as a separator.
p_str = '[a-zA-Z]{1,}'
pattern = re.compile(p_str, re.M)
groups = [pattern.findall(sentence) for sentence in strings]
groups

[26]:

[['I', 'like', 'deep', 'learning'],
 ['I', 'like', 'NLP'],
 ['I', 'enjoy', 'flying'],
 ['they', 'enjoy', 'flying'],
 ['I', 'love', 'eating', 'cake'],
 ['I', 'enjoy', 'good', 'cake'],
 ['I', 'like', 'coding'],
 ['they', 'like', 'coding']]

[27]:

# Vocabulary: every distinct token across all tokenised sentences.
# A bare set() of strings has no stable iteration order (str hashing is
# randomised per interpreter run), so the ids derived from this list would
# change between kernel restarts. Sorting pins one reproducible order.
bow = sorted({token for sentence in groups for token in sentence})
bow

[27]:

['deep',
 'cake',
 'good',
 'love',
 'like',
 'I',
 'flying',
 'NLP',
 'they',
 'enjoy',
 'eating',
 'learning',
 'coding']

[28]:

# Forward lookup: integer id -> word, numbered in vocabulary order.
word_map = {idx: word for idx, word in enumerate(bow)}
# Reverse lookup: word -> integer id.
word_inv_map = {word: idx for idx, word in word_map.items()}

[29]:

# Show the id -> word lookup.
word_map

[29]:

{0: 'deep',
 1: 'cake',
 2: 'good',
 3: 'love',
 4: 'like',
 5: 'I',
 6: 'flying',
 7: 'NLP',
 8: 'they',
 9: 'enjoy',
 10: 'eating',
 11: 'learning',
 12: 'coding'}

Context words association matrix#

One-step context: count only immediately adjacent words (window of one), in both directions

[30]:

# Collect (row, col) index pairs for every pair of adjacent words.
# Each pair is stored in both directions so the resulting count matrix
# is symmetric.
index_pairs = []
for sentence in groups:
    ids = [word_inv_map[token] for token in sentence]
    for left, right in zip(ids, ids[1:]):
        index_pairs.append([left, right])
        index_pairs.append([right, left])
bigrams_maps = np.array(index_pairs)

[31]:

# Accumulate the bigram pairs into a dense count matrix. Entries that share
# a (row, col) index are summed by the sparse constructor, so a pair seen
# k times ends up as k.
# The shape is passed explicitly: left out, it is inferred from the largest
# index present, and a vocabulary word that never occurs in a bigram (for
# example a one-word sentence) could make the matrix smaller than the
# vocabulary.
n_words = len(word_map)
mat = csr_matrix(
    (np.ones(bigrams_maps.shape[0]),
     (bigrams_maps[..., 0], bigrams_maps[..., 1])),
    shape=(n_words, n_words),
).toarray()

[32]:

# Vocabulary as a plain list, in word_map's iteration order.
words: list = [word_map[idx] for idx in word_map]
groups, words

[32]:

([['I', 'like', 'deep', 'learning'],
  ['I', 'like', 'NLP'],
  ['I', 'enjoy', 'flying'],
  ['they', 'enjoy', 'flying'],
  ['I', 'love', 'eating', 'cake'],
  ['I', 'enjoy', 'good', 'cake'],
  ['I', 'like', 'coding'],
  ['they', 'like', 'coding']],
 ['deep',
  'cake',
  'good',
  'love',
  'like',
  'I',
  'flying',
  'NLP',
  'they',
  'enjoy',
  'eating',
  'learning',
  'coding'])

[33]:

# Label both axes of the count matrix with the vocabulary words.
df = pd.DataFrame(mat, index=words, columns=words)
df

[33]:

	deep	cake	good	love	like	I	flying	NLP	they	enjoy	eating	learning	coding
deep	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0
cake	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0
good	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
love	0.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0
like	1.0	0.0	0.0	0.0	0.0	3.0	0.0	1.0	1.0	0.0	0.0	0.0	2.0
I	0.0	0.0	0.0	1.0	3.0	0.0	0.0	0.0	0.0	2.0	0.0	0.0	0.0
flying	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	2.0	0.0	0.0	0.0
NLP	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
they	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	1.0	0.0	0.0	0.0
enjoy	0.0	0.0	1.0	0.0	0.0	2.0	2.0	0.0	1.0	0.0	0.0	0.0	0.0
eating	0.0	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
learning	1.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
coding	0.0	0.0	0.0	0.0	2.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

Plotting word vectors#

[34]:

from sklearn.decomposition import PCA

# Reduce each word's co-occurrence row to two PCA components for plotting.
projector = PCA(n_components=2)
pc = projector.fit_transform(mat)

[35]:

# One labelled point per word in the 2-D PCA projection.
fig, ax = plt.subplots(1, 1, figsize=(15, 15))
for idx, (x_coord, y_coord) in enumerate(pc):
    ax.scatter(x_coord, y_coord, marker='.')
    ax.text(x_coord, y_coord, words[idx])

ax.grid()
plt.show()

../_images/Concepts_word_association_13_0.png

[36]:

from scipy.spatial.distance import cosine

[37]:

def cosine_distance(word1, word2):
    """Return the cosine distance between the co-occurrence rows of two words.

    Both arguments are row labels of the notebook-level ``df``. The result is
    ``scipy.spatial.distance.cosine`` applied to those two rows.
    """
    row_a = df.loc[word1].values
    row_b = df.loc[word2].values
    return cosine(row_a, row_b)

[38]:

# Both verbs have "I" and "they" as neighbours, so their rows overlap.
cosine_distance('like','enjoy')

[38]:

0.4466014094705336

[39]:

# "like" and "love" share only the neighbour "I".
cosine_distance('like', 'love')

[39]:

0.4696699141100893

[40]:

# "enjoy" and "love" also share only the neighbour "I".
cosine_distance('enjoy', 'love')

[40]:

0.5527864045000421

[41]:

# The only neighbour these two words have in common is "deep".
cosine_distance('like', 'learning')

[41]:

0.75

[42]:

# No neighbour in common: the two rows are orthogonal.
cosine_distance('flying', 'learning')

[42]:

1.0

[43]:

# Both pronouns are followed by "like" and "enjoy" in the corpus.
cosine_distance('I', 'they')

[43]:

0.05508881747693195