"""Score tweets against example tweets using sentence-transformer embeddings.

Usage: python <this script> <directory containing the *.csv files>
"""
from sentence_transformers import SentenceTransformer, util
import pandas as pd
import numpy as np
import os
import sys

directory = sys.argv[1]
# directory = 'path/to/directory'
file_list = os.listdir(directory)

# Path to the folder that contains the config.json file.
# This uses the following HF model:
# https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer('/projectnb/jbrcs/tweet2/models/mpnet')

# Loop through each file in the directory
for file in file_list:
    # Only process CSV files
    if not file.endswith('.csv'):
        continue

    # Read the file into a dataframe (no header row; tweet text is in column 5)
    file_path = os.path.join(directory, file)
    data = pd.read_csv(file_path, header=None)

    # The first five rows of column 5 hold the example tweets.
    examples_text = data.loc[0:4, 5]
    # In current *.csv files the example tweets start with some number,
    # so drop everything up to the first whitespace.
    examples_text = examples_text.str.split(n=1).str[1]

    # The remaining rows are the tweets to score.
    # ST wants indexing to start at 0, so we reset the index and drop the old indices.
    tweets_text = data.loc[5:, 5].reset_index(drop=True)
    tweets_text = pd.concat([examples_text, tweets_text], ignore_index=True)

    # Calculate sentence embeddings for examples and tweets
    examples = model.encode(examples_text.tolist())
    tweets = model.encode(tweets_text.tolist())

    # Cosine similarity between every example and every tweet,
    # plus the five best-matching tweets per example.
    similarity = util.cos_sim(examples, tweets)
    vals, inds = similarity.topk(5, dim=1)

    # Append one similarity column per example to the original data and save.
    # (The output path is hardcoded, so every processed file writes to the same CSV.)
    data = pd.concat([data, pd.DataFrame(similarity.numpy().T)], axis=1)
    data.to_csv('./data/biden2.csv', index=False, header=False)
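
# --- Optional sketch (not part of the original pipeline) ---------------------
# `vals` and `inds` from similarity.topk(5, dim=1) are computed above but never
# used. One possible way to use them, assuming at least one CSV was processed
# (the variables below refer to the last file handled in the loop), is to print
# each example tweet together with its five most similar tweets and their
# cosine-similarity scores:
for i, example in enumerate(examples_text):
    print(f'Example {i}: {example}')
    for score, idx in zip(vals[i].tolist(), inds[i].tolist()):
        print(f'  {score:.3f}  {tweets_text[idx]}')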