Co-occurrence matrix#

[25]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

[26]:
# Toy corpus: each sentence is one "document" for the co-occurrence counts.
strings = [
    "I like deep learning",
    "I like NLP",
    "I enjoy flying",
    "they enjoy flying",
    "I love eating cake",
    "I enjoy good cake",
    "I like coding",
    "they like coding"
]

# Tokenize each sentence into runs of ASCII letters.
# '+' is the idiomatic spelling of '{1,}', and re.M is dropped because
# the pattern uses no line anchors (^/$), so the flag had no effect.
p_str = r'[a-zA-Z]+'
pattern = re.compile(p_str)
groups = [pattern.findall(s) for s in strings]
groups

[26]:
[['I', 'like', 'deep', 'learning'],
 ['I', 'like', 'NLP'],
 ['I', 'enjoy', 'flying'],
 ['they', 'enjoy', 'flying'],
 ['I', 'love', 'eating', 'cake'],
 ['I', 'enjoy', 'good', 'cake'],
 ['I', 'like', 'coding'],
 ['they', 'like', 'coding']]
[27]:
# Build the vocabulary (bag of words) by flattening the token lists.
bow = []
for sent in groups:
    bow += sent

# Deduplicate with dict.fromkeys, which keeps first-occurrence order.
# list(set(...)) would depend on PYTHONHASHSEED, making the word→index
# mapping (and every downstream matrix/plot) differ between runs.
bow = list(dict.fromkeys(bow))
bow
[27]:
['deep',
 'cake',
 'good',
 'love',
 'like',
 'I',
 'flying',
 'NLP',
 'they',
 'enjoy',
 'eating',
 'learning',
 'coding']
[28]:
# Index ↔ word lookup tables used to encode/decode the matrix axes.
word_map = dict(enumerate(bow))                      # index -> word
word_inv_map = {w: i for i, w in word_map.items()}   # word -> index
[29]:
word_map
[29]:
{0: 'deep',
 1: 'cake',
 2: 'good',
 3: 'love',
 4: 'like',
 5: 'I',
 6: 'flying',
 7: 'NLP',
 8: 'they',
 9: 'enjoy',
 10: 'eating',
 11: 'learning',
 12: 'coding'}

Context word association matrix#

One-step context window: only directly adjacent words count as co-occurring.

[30]:
# Collect every adjacent word pair (window size 1) as a pair of indices.
# Each pair is recorded in both directions so the resulting
# co-occurrence matrix is symmetric.
bigrams_maps = []
for sent in groups:
    for left, right in zip(sent, sent[1:]):
        a = word_inv_map[left]
        b = word_inv_map[right]
        bigrams_maps.extend([[a, b], [b, a]])
bigrams_maps = np.array(bigrams_maps)

[31]:
# Build the dense co-occurrence matrix. csr_matrix sums duplicate
# (row, col) entries, so repeated bigrams accumulate into counts.
# Passing shape explicitly guarantees a |V| x |V| matrix even if the
# highest-indexed vocabulary word never appears in any bigram
# (otherwise the inferred shape would be too small and the labelled
# DataFrame below would fail).
n_words = len(bow)
mat = csr_matrix(
    (np.ones(bigrams_maps.shape[0]),
     (bigrams_maps[..., 0], bigrams_maps[..., 1])),
    shape=(n_words, n_words),
).toarray()

[32]:
# Ordered vocabulary list; position i matches row/column i of `mat`.
words = [word_map[i] for i in range(len(word_map))]
groups, words
[32]:
([['I', 'like', 'deep', 'learning'],
  ['I', 'like', 'NLP'],
  ['I', 'enjoy', 'flying'],
  ['they', 'enjoy', 'flying'],
  ['I', 'love', 'eating', 'cake'],
  ['I', 'enjoy', 'good', 'cake'],
  ['I', 'like', 'coding'],
  ['they', 'like', 'coding']],
 ['deep',
  'cake',
  'good',
  'love',
  'like',
  'I',
  'flying',
  'NLP',
  'they',
  'enjoy',
  'eating',
  'learning',
  'coding'])
[33]:
# Label both axes with the vocabulary in one constructor call instead of
# assigning df.index afterwards — same result, no post-hoc mutation.
df = pd.DataFrame(mat, index=words, columns=words)
df

[33]:
deep cake good love like I flying NLP they enjoy eating learning coding
deep 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0
cake 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
good 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
love 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
like 1.0 0.0 0.0 0.0 0.0 3.0 0.0 1.0 1.0 0.0 0.0 0.0 2.0
I 0.0 0.0 0.0 1.0 3.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0
flying 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0
NLP 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
they 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0
enjoy 0.0 0.0 1.0 0.0 0.0 2.0 2.0 0.0 1.0 0.0 0.0 0.0 0.0
eating 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
learning 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
coding 0.0 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

Plotting word vectors#

[34]:
# Project the |V|-dimensional co-occurrence rows down to 2-D so each
# word can be drawn as a point.
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pc = pca.fit_transform(mat)
[35]:
# Scatter each word at its 2-D PCA coordinates, labelled by the word.
fig, ax = plt.subplots(figsize=(15, 15))
for (x, y), label in zip(pc, words):
    ax.scatter(x, y, marker='.')
    ax.text(x, y, label)

ax.grid()
plt.show()
../_images/Concepts_word_association_13_0.png
[36]:
from scipy.spatial.distance import cosine
[37]:
def cosine_distance(word1, word2):
    """Cosine distance (1 - cosine similarity) between two words.

    Each word's vector is its row in the global co-occurrence
    DataFrame ``df``; 0.0 means identical direction, 1.0 orthogonal.
    """
    u = df.loc[word1].values
    v = df.loc[word2].values
    return cosine(u, v)

[38]:
# 'like' and 'enjoy' share several contexts (I/they before, coding/flying after).
cosine_distance('like','enjoy')
[38]:
0.4466014094705336
[39]:
# 'like' and 'love' both follow 'I', giving a moderate distance.
cosine_distance('like', 'love')
[39]:
0.4696699141100893
[40]:
# 'enjoy' vs 'love' — fewer shared contexts than the pairs above.
cosine_distance('enjoy', 'love')
[40]:
0.5527864045000421
[41]:
# A verb vs. its object: only weakly related contexts.
cosine_distance('like', 'learning')
[41]:
0.75
[42]:
# 'flying' and 'learning' share no adjacent words at all → distance 1.0.
cosine_distance('flying', 'learning')
[42]:
1.0
[43]:
# 'I' and 'they' appear in nearly identical contexts (both precede
# like/enjoy), so their vectors are almost parallel.
cosine_distance('I', 'they')

[43]:
0.05508881747693195