## Imports

In [None]:
import numpy as np

In [None]:
from gensim.models import Word2Vec, FastText
import pandas as pd

In [None]:
from gensim.test.utils import common_texts
import gensim.downloader
# Show the names listed under the 'models' key of the gensim downloader catalogue.
print(list(gensim.downloader.info()['models'].keys()))

## Download/load model

In [None]:
# With return_path=True the loader hands back a filesystem path rather than
# the loaded vectors. NOTE(review): this may still trigger a download on the
# first call -- confirm against the gensim downloader docs.
print(gensim.downloader.load('glove-twitter-25', return_path=True))

In [None]:
# Load the pretrained 'glove-twitter-25' vectors (presumably 25-dimensional,
# going by the model name); `gv` is used for all word lookups below.
gv = gensim.downloader.load('glove-twitter-25')

In [None]:
# Sanity check of the loaded vectors: the 5 entries most similar to 'twitter'.
gv.most_similar('twitter',topn=5)

## Calculate Jaccard score

In [None]:
def intersection_volume(A1, A2, B1, B2):
    """Return the volume of the overlap between two axis-aligned boxes.

    Box A spans from corner ``A1`` (per-axis minima) to corner ``A2``
    (per-axis maxima); box B likewise spans ``B1`` to ``B2``.  If the boxes
    do not overlap along some axis, the result is 0.
    """
    # On every axis the overlap starts at the larger of the two minima and
    # ends at the smaller of the two maxima.
    lower = np.maximum(A1, B1)
    upper = np.minimum(A2, B2)
    # A negative extent means "no overlap on this axis"; clamp it to zero so
    # the product collapses to 0 instead of changing sign.
    extents = np.maximum(upper - lower, 0)
    return np.prod(extents)

In [None]:
# Two example 3-D boxes, each given as (per-axis minimum corner, per-axis maximum corner).
A1, A2 = np.array([-0.5, -0.5, -0.5]), np.array([0.5, 0.5, 0.5])  # box A
B1, B2 = np.array([0, 0, 0]), np.array([1, 1, 1])  # box B

# The boxes share the cube [0, 0.5]^3, so the overlap is 0.5 ** 3.
intersection_volume(A1, A2, B1, B2)  # Output: 0.125

In [None]:
def volume(A1, A2):
    """Return the volume of the axis-aligned box spanning ``A1`` to ``A2``.

    ``A1`` holds the per-axis minima and ``A2`` the per-axis maxima.  Any
    axis where the maximum lies below the minimum contributes a length of
    0, which makes the whole volume 0.
    """
    # Clamp negative side lengths to zero before multiplying them together.
    side_lengths = np.maximum(A2 - A1, 0)
    return np.prod(side_lengths)

In [None]:
def Jaccard(A1, A2, B1, B2):
    """Return the Jaccard index (intersection over union) of two boxes.

    Box A spans ``A1`` (per-axis minima) to ``A2`` (per-axis maxima); box B
    spans ``B1`` to ``B2``.  The score is the overlap volume divided by the
    union volume, where the union is computed by inclusion-exclusion.

    Returns 0.0 when the union volume is zero (both boxes are degenerate),
    instead of dividing 0 by 0 and producing ``nan`` with a RuntimeWarning.
    """
    # Compute the overlap once; it is needed for both numerator and union.
    overlap = intersection_volume(A1, A2, B1, B2)
    union_volume = volume(A1, A2) + volume(B1, B2) - overlap
    if union_volume == 0:
        # Neither box has any volume, so there is nothing to overlap.
        return 0.0
    return overlap / union_volume

In [None]:
# Jaccard index of the example boxes A and B:
# 0.125 / (1 + 1 - 0.125), i.e. roughly 0.0667.
Jaccard(A1,A2,B1,B2)

## Calculate the Jaccard score for pairs of word sets

In [None]:
# Look up the word vectors for the tokens of three example sentences.
# Indexing `gv` with several keys is expected to return one row per token
# (the later `s1[:,0]` relies on a 2-D result) -- NOTE(review): confirm
# against the gensim KeyedVectors docs.
# Sentences 1 and 2 both contain "bank" (river bank vs. financial bank);
# sentence 3 is about a river, like sentence 1.
s1 = gv['i','love','to','go','to','the','water','and','sit','on','the','bank']
s2 = gv['i','will','go','to','the','bank','and','make','a','deposit']
s3 = gv['we','went','to','the','river','and','swam']

In [None]:
# Bounding box of each sentence in embedding space: reducing over axis 0
# (the token axis) leaves the per-dimension minimum (`n`) and maximum (`x`).
s1n, s1x = np.min(s1, axis=0), np.max(s1, axis=0)
s2n, s2x = np.min(s2, axis=0), np.max(s2, axis=0)
s3n, s3x = np.min(s3, axis=0), np.max(s3, axis=0)

In [None]:
# First embedding coordinate of every token in sentence 1.
s1[:,0]

In [None]:
# Per-dimension maximum over sentence 1's token vectors (upper corner of its box).
s1x

In [None]:
# Box overlap score between sentence 1 and sentence 2.
Jaccard(s1n,s1x,s2n,s2x)

In [None]:
# Box overlap score between sentence 1 and sentence 3.
Jaccard(s1n,s1x,s3n,s3x)

In [None]:
# Box overlap score between sentence 3 and sentence 2.
Jaccard(s3n,s3x,s2n,s2x)

## Training from scratch
Example of training Word2Vec model from corpus:

In [None]:
# Toy training corpus: a list of sentences, each already split into
# lowercase tokens.
sentences = [['i', 'like', 'apple', 'pie', 'for', 'dessert'],
           ['i', 'dont', 'drive', 'fast', 'cars'],
           ['data', 'science', 'is', 'fun'],
           ['chocolate', 'is', 'my', 'favorite'],
           ['my', 'favorite', 'movie', 'is', 'predator']]

In [None]:
# Train a Word2Vec model on the toy corpus: 100-dimensional vectors, a
# context window of 5, and 4 worker threads. min_count=1 keeps words that
# occur only once, which matters here because most words in this corpus do.
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [None]:
# `model.wv` holds the trained word vectors; index it by word to get that word's vector.
print(model.wv['chocolate'])  # get numpy vector of a word