from sentence_transformers import SentenceTransformer, util
import pandas as pd
import os
import sys

###======User Defined Parameters==========
# Column index of the feedback text in each CSV
data_column = 3
# Header row number; parsing starts after this row
data_row = 1
# How many total test examples are provided
num_tests = 5
test_file = 'tests.csv'
# Path to desired SentenceTransformer model (relative to MYPATH)
model_path = 'models/sentence-transformers_all-mpnet-base-v2'
###=======================================

# Get the path to the install folder set in run.sh
# (falls back to the current directory if MYPATH is unset)
MYPATH = os.getenv('MYPATH', '')

# Define a "default" directory to use, which can be overridden at the
# command line by providing the desired directory as an argument
if len(sys.argv) > 1:
    directory = sys.argv[1]
else:
    directory = os.path.join(MYPATH, 'data')

print('Reading data from ' + directory)
file_list = os.listdir(directory)

# Read the test sentences that all the feedback is compared against
tests = pd.read_csv(os.path.join(MYPATH, test_file), keep_default_na=False,
                    header=None, encoding='unicode_escape', engine='python')
tests_text = tests.loc[:, 0]

# Load the model once, from the folder that contains its config.json file.
# This uses the following HF model:
# https://huggingface.co/sentence-transformers/all-mpnet-base-v2
model = SentenceTransformer(os.path.join(MYPATH, model_path))

# Calculate sentence embeddings for the tests once, up front
test_embeddings = model.encode(tests_text.tolist())

# Loop through each file in the directory
for file in file_list:
    # Only process CSV files
    if not file.endswith('.csv'):
        continue
    print('Computing similarity for ' + file)

    # Read the file into a dataframe
    file_path = os.path.join(directory, file)
    data = pd.read_csv(file_path, keep_default_na=False, header=None,
                       encoding='unicode_escape', engine='python')

    # SentenceTransformer wants indexing to start at 0, so we reset
    # the index and drop the old indices
    feedback_text = data.loc[data_row:, data_column].reset_index(drop=True)

    # Calculate sentence embeddings for the feedback
    feedback_embeddings = model.encode(feedback_text.tolist())

    # Compute cosine similarity between every test and every feedback
    # entry, then transpose so each row corresponds to a feedback entry
    similarity = util.cos_sim(test_embeddings, feedback_embeddings)
    similarity = pd.DataFrame(similarity.numpy().T)

    # Shift the index so each score row lines up with the feedback row
    # it was computed from (feedback starts at data_row, after the header)
    similarity.index += data_row
    data = pd.concat([data, similarity], axis=1)

    # Reset so all columns have a unique name
    data.columns = range(data.columns.size)

    # Write out processed feedback with the computed similarity appended
    data.to_csv(os.path.join(directory, file), index=False, header=False)
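
# ----------------------------------------------------------------------
# Usage sketch. The script filename below is a hypothetical placeholder,
# and MYPATH is assumed to be exported by run.sh as noted above:
#
#   export MYPATH=/path/to/install
#   python compute_similarity.py               # reads $MYPATH/data
#   python compute_similarity.py /other/data   # reads the given directory
# ----------------------------------------------------------------------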