In [34]:
NUM_WORKERS = 1
BATCH_SIZE = 2
EPOCHS = 2
LEARNING_RATE = 0.0015
EMBED_DIM = 300
WINDOW = 2

In [35]:
import torch
import torch.nn as nn

In [36]:
# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using {device} for computation")

Using cuda for computation


# Environment
We are running this on the SCC using the `python3/3.10.12` and `pytorch/1.13.1` modules. 

# Corpus for training
"The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia."
https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/

This is available to use from the Torchtext package.

The `pytorch/1.13.1` module comes with a version of `torchtext`, but apparently this requires `torchdata` to run. According to:<br>
https://github.com/pytorch/data#installation<br>
for PyTorch 1.13.1 we should use Torchdata 0.5.1

`pip install --no-cache-dir --prefix=/projectnb/jbrcs/word2vec/packages torchdata==0.5.1`

In [37]:
from torch.utils.data import DataLoader
from torchtext.datasets import WikiText103

train, valid, test = WikiText103(root='./data', split= ('train', 'valid', 'test'))

In [38]:
valid

ShardingFilterIterDataPipe

This is a "DataPipe" which allows us to stream in data efficiently to our program. We can make an iterator using the DataPipe if we want:

In [39]:
myiter = iter(valid)
print(repr(next(myiter)))
print(repr(next(myiter)))
print(repr(next(myiter)))
print(repr(next(myiter)))

' \n'
' = Homarus gammarus = \n'
' \n'
' Homarus gammarus , known as the European lobster or common lobster , is a species of clawed lobster from the eastern Atlantic Ocean , Mediterranean Sea and parts of the Black Sea . It is closely related to the American lobster , H. americanus . It may grow to a length of 60 cm ( 24 in ) and a mass of 6 kilograms ( 13 lb ) , and bears a conspicuous pair of claws . In life , the lobsters are blue , only becoming " lobster red " on cooking . Mating occurs in the summer , producing eggs which are carried by the females for up to a year before hatching into planktonic larvae . Homarus gammarus is a highly esteemed food , and is widely caught using lobster pots , mostly around the British Isles . \n'


We want to build a "vocabulary" of all the words in our corpus. To do this we need to "tokenize" the corpus into individual words. We could write our own simple tokenizer by splitting on spaces, etc; instead we will use one included with Torchtext: <br>
https://pytorch.org/text/stable/data_utils.html#get-tokenizer

Then we build our vocabulary, assigning an index to each word: <br>
https://pytorch.org/text/stable/vocab.html#build-vocab-from-iterator

In [40]:
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

# This tokenizer converts words to token ID numbers
tokenizer = get_tokenizer('basic_english')
train, valid, test = WikiText103(root='./data', split= ('train', 'valid', 'test'))

print("Preprocessing")
remove_titles = lambda x: x[:2]!=' ='
remove_short_sequences = lambda x: len(x)>2*WINDOW
data = valid.filter(remove_titles) # Change x.filter to split desired
data = data.map(tokenizer)
data = data.filter(remove_short_sequences)

vocab = build_vocab_from_iterator(data, min_freq=1, specials=["<unk>"])
VOCAB_SIZE = len(vocab)
data = data.map(vocab.lookup_indices)

Preprocessing


In [41]:
print(f"Number of words in vocab: {VOCAB_SIZE}")

# Example of getting ID using vocab
print(
    vocab_valid['cool'])

# We can tokenize a whole sentence like so:
vocab_valid.lookup_indices(tokenizer('Josh is a cool guy'))

Number of words in vocab: 16502
4819


[13028, 24, 8, 4819, 3331]

In [42]:
# Using a dataloader allows us to efficiently work with large amounts of data
print("Creating dataloader")
loader = DataLoader(
        dataset=data,
        batch_size=BATCH_SIZE,
        shuffle=True,
        drop_last=True,
        num_workers=NUM_WORKERS,
        collate_fn=lambda x: x) #This collate simply passes data through

Creating dataloader


In [43]:
def make_context(tokens, window):
    # This gives the context words within the window around the target word. For example:
    # the quick BROWN fox jumps
    # If the target word is "brown". For a window size of 2 the context words are:
    # "the, quick, fox, jumps"
    
    # The first/last "window" amount of words have their left/right context window instersect with the start/end
    # We still want 2*window words for our context, so we take the next closest words by
    # padding the start and end of a document with a "window" amount of words. For example:
    # fox jumps | the quick brown fox jumps
    # Where the "|" indicates the end of the padded portion. So if our window size is still 2 and the target
    # is "quick" then we only have one word in the left window in th original sequence, but with the padded one:
    # fox jumps | the QUICK brown fox jumps
    # Our left window now extends into the padded region. The effect is as if our left window size is one and
    # the right window size is three, but it makes it easier to code.
    padded = (tokens[window+1 : 2*window+1]
              + tokens
              + tokens[-(2*window+1) : -(window+1)])

    data = []
    for i in range(window, len(padded) - window):
        context = padded[i-window : i] + padded[i+1 : i+1+window]
        data.append(context)

    return torch.tensor(tokens), torch.tensor(data)

In [44]:
class CBOW(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)        

    def forward(self, inputs):
        embeds = torch.sum(self.embeddings(inputs), dim=1)
        out = self.linear(embeds)
        return out

In [45]:
for idx, batch in enumerate(loader_valid):
    for sample in batch:
        print(sample)
        break
    break

[1763, 0, 20, 248, 165, 2, 2593, 50, 267, 665, 2, 408, 21, 9, 30, 122, 542, 986, 1365, 2682, 3, 27, 9, 8, 585, 4, 1, 343, 1968, 240, 20, 0, 21, 5, 1, 1308, 10665, 1101, 2, 5, 27, 385, 11, 454, 107, 675, 19, 12386, 8642, 3, 0, 1043, 19, 2076, 10624, 2, 13040, 10994, 2, 8996, 7520, 2, 5476, 5000, 2, 6845, 3031, 2, 2514, 13833, 2, 13262, 14913, 2, 13224, 3929, 2, 8528, 15260, 2, 3790, 2152, 2, 5, 1, 10676, 3]


In [46]:
print("Sending model")
model = CBOW(len(vocab), EMBED_DIM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

Sending model


In [48]:
print("Begin training")
for epoch in range(EPOCHS):
    for idx, batch in enumerate(loader):
        targets = torch.tensor(batch[0]).to(device)
        context = make_context(batch[0], WINDOW).to(device)
        out = model(context)
        loss = criterion(out, targets)
    
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if idx%200==0:
            print(idx, loss.item())

Begin training


AttributeError: 'tuple' object has no attribute 'to'

In [None]:
#TESTING
# widely caught using lobster pots
#Atlantic Ocean , Mediterranean Sea and
#widely caught lobster pots
context = 'Atlantic Ocean Sea and'
context_vector = v1.lookup_indices(tokenizer(context))
a = model(torch.tensor(context_vector, dtype=torch.long).to(device))
val,ind = torch.topk(a,10)
v1.lookup_tokens(ind.tolist()[0])

In [None]:
def mysim(a,b):
    aa = model.embeddings(torch.tensor(v1[a], dtype=torch.long).to(device))
    bb = model.embeddings(torch.tensor(v1[b], dtype=torch.long).to(device))
    sim = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
    return sim(aa,bb).item()

mysim('lobster','josh')
    

# TODO
* Create  a counter from collections to get word frequency
* Use the counter to create a vocab
* Use the word frequency for subsampling to improve performance (use subsampling distribution from word2vec paper) by sampling common words less frequently