"""Score tweets against example tweets using sentence-transformer embeddings.

Usage: python <this script> <directory containing the *.csv files>
"""
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import os
import sys

directory = sys.argv[1]
# directory = 'path/to/directory'
file_list = os.listdir(directory)

# Path to the folder that contains the config.json file.
# This uses the following HF model:
# https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer('/projectnb/jbrcs/tweet2/models/mpnet')

# Loop through each file in the directory
for file in file_list:
    # Only process CSV files
    if not file.endswith('.csv'):
        continue

    # Read the file into a dataframe (no header row; tweet text is in column 5)
    file_path = os.path.join(directory, file)
    data = pd.read_csv(file_path, header=None)

    # The first five rows of column 5 hold the example tweets.
    examples_text = data.loc[0:4, 5]
    # In current *.csv files the example tweets start with some number,
    # so drop everything up to the first whitespace.
    examples_text = examples_text.str.split(n=1).str[1]

    # The remaining rows are the tweets to score.
    # ST wants indexing to start at 0, so we reset the index and drop the old indices.
    tweets_text = data.loc[5:, 5].reset_index(drop=True)
    tweets_text = pd.concat([examples_text, tweets_text], ignore_index=True)

    # Calculate sentence embeddings for examples and tweets
    examples = model.encode(examples_text.tolist())
    tweets = model.encode(tweets_text.tolist())

    # Cosine similarity between every example and every tweet,
    # plus the five best-matching tweets per example.
    similarity = util.cos_sim(examples, tweets)
    vals, inds = similarity.topk(5, dim=1)

    # Append one similarity column per example to the original data and save.
    # (The output path is hardcoded, so every processed file writes to the same CSV.)
    data = pd.concat([data, pd.DataFrame(similarity.numpy().T)], axis=1)
    data.to_csv('./data/biden2.csv', index=False, header=False)
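
# --- Optional sketch (not part of the original pipeline) ---------------------
# `vals` and `inds` from similarity.topk(5, dim=1) are computed above but never
# used. One possible way to use them, assuming at least one CSV was processed
# (the variables below refer to the last file handled in the loop), is to print
# each example tweet together with its five most similar tweets and their
# cosine-similarity scores:
for i, example in enumerate(examples_text):
    print(f'Example {i}: {example}')
    for score, idx in zip(vals[i].tolist(), inds[i].tolist()):
        print(f'  {score:.3f}  {tweets_text[idx]}')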