from sentence_transformers import SentenceTransformer, util
import pandas as pd
import os
import sys

###======User Defined Parameters==========
# Column index of the feedback text in each CSV
data_column = 3
# Header row number; parsing starts after this row
data_row = 1
# How many total test examples are provided
num_tests = 5
test_file = 'tests.csv'
# Path to desired SentenceTransformer model (relative to MYPATH)
model_path = 'models/sentence-transformers_all-mpnet-base-v2'
###=======================================

# Get the path to the install folder set in run.sh
# (falls back to the current directory if MYPATH is unset)
MYPATH = os.getenv('MYPATH', '')

# Define a "default" directory to use, which can be overridden at the
# command line by providing the desired directory as an argument
if len(sys.argv) > 1:
    directory = sys.argv[1]
else:
    directory = os.path.join(MYPATH, 'data')

print('Reading data from ' + directory)
file_list = os.listdir(directory)

# Read the test sentences that all the feedback is compared against
tests = pd.read_csv(os.path.join(MYPATH, test_file), keep_default_na=False,
                    header=None, encoding='unicode_escape', engine='python')
tests_text = tests.loc[:, 0]

# Load the model once, from the folder that contains its config.json file.
# This uses the following HF model:
# https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer(os.path.join(MYPATH, model_path))

# Calculate sentence embeddings for the tests once, up front
test_embeddings = model.encode(tests_text.tolist())

# Loop through each file in the directory
for file in file_list:
    # Only process CSV files
    if not file.endswith('.csv'):
        continue
    print('Computing similarity for ' + file)

    # Read the file into a dataframe
    file_path = os.path.join(directory, file)
    data = pd.read_csv(file_path, keep_default_na=False, header=None,
                       encoding='unicode_escape', engine='python')

    # SentenceTransformer wants indexing to start at 0, so we reset
    # the index and drop the old indices
    feedback_text = data.loc[data_row:, data_column].reset_index(drop=True)

    # Calculate sentence embeddings for the feedback
    feedback_embeddings = model.encode(feedback_text.tolist())

    # Compute cosine similarity between every test and every feedback
    # entry, then transpose so each row corresponds to a feedback entry
    similarity = util.cos_sim(test_embeddings, feedback_embeddings)
    similarity = pd.DataFrame(similarity.numpy().T)

    # Shift the index so each score row lines up with the feedback row
    # it was computed from (feedback starts at data_row, after the header)
    similarity.index += data_row
    data = pd.concat([data, similarity], axis=1)

    # Reset so all columns have a unique name
    data.columns = range(data.columns.size)

    # Write out processed feedback with the computed similarity appended
    data.to_csv(os.path.join(directory, file), index=False, header=False)
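
# ----------------------------------------------------------------------
# Usage sketch. The script filename below is a hypothetical placeholder,
# and MYPATH is assumed to be exported by run.sh as noted above:
#
#   export MYPATH=/path/to/install
#   python compute_similarity.py               # reads $MYPATH/data
#   python compute_similarity.py /other/data   # reads the given directory
# ----------------------------------------------------------------------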