In [None]:
# Model
EMBED_DIM = 300         # width of each word-embedding vector
WINDOW = 2              # context words taken on each side of the target word

# Optimisation
EPOCHS = 2              # full passes over the data
LEARNING_RATE = 1.5e-3  # step size handed to the optimizer

# Data loading
BATCH_SIZE = 1          # samples per DataLoader batch
NUM_WORKERS = 1         # DataLoader worker processes

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchtext.datasets import WikiText103
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

In [None]:
# Device configuration: use the GPU when PyTorch can see one, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using {device} for computation")

# Environment
We are running this on the SCC using the `python3/3.10.12` and `pytorch/1.13.1` modules. 

# Corpus for training
"The WikiText language modeling dataset is a collection of over 100 million tokens extracted from the set of verified Good and Featured articles on Wikipedia."
https://blog.salesforceairesearch.com/the-wikitext-long-term-dependency-language-modeling-dataset/

This is available to use from the Torchtext package.

The `pytorch/1.13.1` module comes with a version of `torchtext`, but this `torchtext` requires `torchdata` to run, so we install a matching version ourselves. According to:<br>
https://github.com/pytorch/data#installation<br>
for PyTorch 1.13.1 we should use Torchdata 0.5.1

`pip install --no-cache-dir --prefix=/projectnb/jbrcs/word2vec/packages torchdata==0.5.1`

In [None]:
# Load all three splits of WikiText-103 from the shared copy on disk
train, valid, test = WikiText103(
    root='/project/scv/examples/machine_learning/tutorials/LLM/data',
    split=('train', 'valid', 'test'),
)

In [None]:
# Display the validation split object returned by WikiText103
valid

This is a "DataPipe" which allows us to stream in data efficiently to our program. We can make an iterator using the DataPipe if we want:

In [None]:
# Peek at the first four raw lines of the validation split
myiter = iter(valid)
for line_number in range(4):
    raw_line = next(myiter)
    print(repr(raw_line))

We want to build a "vocabulary" of all the words in our corpus. To do this we need to "tokenize" the corpus into individual words. We could write our own simple tokenizer (for example, by splitting on spaces), but instead we will use one included with Torchtext: <br>
https://pytorch.org/text/stable/data_utils.html#get-tokenizer

Then we build our vocabulary, assigning an index to each word: <br>
https://pytorch.org/text/stable/vocab.html#build-vocab-from-iterator

In [None]:
# The tokenizer splits a line of text into a list of word tokens (strings).
# The vocab built further down is what converts those tokens to integer IDs.
tokenizer = get_tokenizer('basic_english')

print("Preprocessing")


def remove_titles(line):
    """Keep a raw line unless its first two characters are ' =' (heading lines)."""
    return line[:2] != ' ='


def remove_short_sequences(tokens):
    """Keep a token list only if it has more than 2*WINDOW tokens."""
    return len(tokens) > 2*WINDOW


data = valid.filter(remove_titles) # <<<<<<<<<<<<<<<<<<<<<< Change XYZ.filter to XYZ split desired
data = data.map(tokenizer)
data = data.filter(remove_short_sequences)

vocab = build_vocab_from_iterator(data, min_freq=1, specials=["<unk>"])
# Without a default index, looking up any token that never appeared in the
# corpus raises a RuntimeError. Map unknown tokens to "<unk>" instead so the
# lookups in the later cells work for arbitrary input words.
vocab.set_default_index(vocab["<unk>"])
VOCAB_SIZE = len(vocab)
data = data.map(vocab.lookup_indices)

In [None]:
print(f"Number of words in vocab: {VOCAB_SIZE}")

# Look up the integer ID of a single word
print(vocab['cool'])

# A whole sentence: tokenize it first, then map every token to its ID
sentence_tokens = tokenizer('Josh is a cool guy')
vocab.lookup_indices(sentence_tokens)

In [None]:
# A DataLoader lets us work through large amounts of data efficiently
print("Creating dataloader")
loader_options = dict(
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=True,
    num_workers=NUM_WORKERS,
    # Identity collate: each batch is handed back exactly as it was gathered
    collate_fn=lambda x: x,
)
loader = DataLoader(data, **loader_options)

In [None]:
def make_context(tokens, window):
    """Build the context words for every position of ``tokens``.

    For each target word the context is the ``window`` words on either side
    of it. For example, with a window of 2:
        the quick BROWN fox jumps
    the context of "brown" is "the, quick, fox, jumps".

    The first/last ``window`` words have a left/right context window that
    intersects the start/end of the sequence. We still want 2*window context
    words for them, so we take the next closest words by padding the start
    and end of the sequence with ``window`` words borrowed from the far side
    of the window. For example:
        fox jumps | the quick brown fox jumps
    where "|" marks the end of the padded portion. With a window of 2 and
    "quick" as the target, the original sequence has only one word to the
    left, but in the padded one:
        fox jumps | the QUICK brown fox jumps
    the left window extends into the padded region. The effect is as if the
    left window had size one and the right window size three, but it keeps
    the code simple.

    Args:
        tokens: list of token IDs (it is sliced and concatenated with ``+``).
        window: number of context words on each side of the target.

    Returns:
        A tensor with one row per token; each row holds that token's
        2*window context words.

    Raises:
        ValueError: if ``window`` is negative, or if ``tokens`` has fewer than
            2*window + 1 entries, in which case the padding cannot supply
            2*window distinct neighbours and the rows would no longer line up
            one-to-one with the tokens.
    """
    if window < 0:
        raise ValueError(f"window must be non-negative, got {window}")
    if len(tokens) < 2*window + 1:
        raise ValueError(
            f"need at least {2*window + 1} tokens for window={window}, "
            f"got {len(tokens)}")

    padded = (tokens[window+1 : 2*window+1]
              + tokens
              + tokens[-(2*window+1) : -(window+1)])

    # Position i of ``padded`` corresponds to token i-window of ``tokens``
    contexts = [padded[i-window : i] + padded[i+1 : i+1+window]
                for i in range(window, len(padded) - window)]

    return torch.tensor(contexts)

In [None]:
class CBOW(torch.nn.Module):
    """Continuous bag-of-words model.

    The context word indices are embedded, the embeddings are summed, and a
    linear layer turns the sum into one score per vocabulary entry.

    Args:
        vocab_size: number of embedding rows and number of output scores.
        embedding_dim: width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Score every vocabulary entry for each row of context indices.

        The embeddings of ``inputs`` are summed over dim 1, so an index
        tensor of shape (N, C) gives an output of shape (N, vocab_size).
        """
        embeds = torch.sum(self.embeddings(inputs), dim=1)
        return self.linear(embeds)

    def get_word_emdedding(self, word):
        """Return the embedding of one word as a (1, embedding_dim) tensor.

        Args:
            word: a word string, looked up in the notebook-level ``vocab``,
                or an integer vocabulary index that is used directly.
        """
        # The original looked the word up in ``word_to_ix``, which is not
        # defined anywhere in this notebook; ``vocab`` is the mapping in use.
        index = vocab[word] if isinstance(word, str) else int(word)
        # Build the index on the same device as the embedding table so this
        # also works after the model has been moved to the GPU.
        index_tensor = torch.tensor([index], device=self.embeddings.weight.device)
        return self.embeddings(index_tensor).view(1, -1)

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    get_word_embedding = get_word_emdedding

In [None]:
# Show the first sample of the first batch, then stop iterating
for batch in loader:
    if batch:
        print(batch[0])
    break

In [None]:
print("Sending model")
# Build the network, then move its parameters to the selected device
model = CBOW(vocab_size=len(vocab), embedding_dim=EMBED_DIM)
model = model.to(device)

# Cross-entropy over the vocabulary scores, optimised with plain SGD
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
print("Begin training")
for epoch in range(EPOCHS):
    for idx, batch in enumerate(loader):
        # ``batch`` is a list of samples (the collate function passes data
        # through untouched). Use every sample, not just batch[0], so nothing
        # is dropped when BATCH_SIZE > 1. Every token is a target, and the
        # matching row of ``context`` holds its surrounding words.
        targets = torch.tensor(
            [token for sample in batch for token in sample]).to(device)
        context = torch.cat(
            [make_context(sample, WINDOW) for sample in batch]).to(device)
        out = model(context)
        loss = criterion(out, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if idx%200==0:
            print(idx, loss.item())

In [None]:
def predi(myin):
    """Print the five highest-scoring vocabulary words for a context.

    Args:
        myin: the context words as one string, or a sequence whose first
            element is that string (the original calling convention).
            Previously a bare string was silently reduced to its first
            character by the ``myin[0]`` indexing.
    """
    text = myin if isinstance(myin, str) else myin[0]
    token_ids = vocab.lookup_indices(tokenizer(text))

    # A single row: the whole sentence is treated as one bag of context words
    context = torch.tensor([token_ids]).to(device)
    # Inference only, so there is no need to record the autograd graph
    with torch.no_grad():
        output = model(context)
    print(vocab.lookup_tokens(torch.topk(output, 5)[1][0].tolist()))

In [None]:
# Context sentences to try; the trailing comment on each is the author's note for it
example_contexts = [
    "powers or prerogatives of the Court or determine how judicial",           # Supreme
    "not limit the subjects the Court may hear jurisdiction federal",          # Supreme
    "Japanese artistic books that people buy to read cartoon drawings",        # SYNTHETIC
    "signed contract with appeared several its campaigns one which featured",  # advertising
]
for example_context in example_contexts:
    predi([example_context])

In [None]:
def my_test(a,b):
    """Return the cosine similarity between the embeddings of two words.

    Args:
        a, b: word strings, looked up in ``vocab``, or integer vocabulary
            indices that are used directly.

    Returns:
        The cosine similarity as a Python float.
    """
    def _to_index(word):
        # torch.tensor() cannot build an integer tensor from a string, so a
        # word has to be converted to its vocabulary index first.
        return vocab[word] if isinstance(word, str) else word

    # Only reading the embeddings, so skip autograd bookkeeping
    with torch.no_grad():
        aa = model.embeddings(torch.tensor(_to_index(a), dtype=torch.long).to(device))
        bb = model.embeddings(torch.tensor(_to_index(b), dtype=torch.long).to(device))
        sim = torch.nn.CosineSimilarity(dim=0, eps=1e-6)
        return sim(aa,bb).item()

my_test('lobster','josh')
    