import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from ast import literal_eval
df = pd.read_csv('output/embedded_babbage_similarity_50k.csv', index_col=0) # note that you will need to generate this file to run the code below
df['babbage_similarity'] = df.babbage_similarity.apply(literal_eval).apply(np.array)
X_train, X_test, y_train, y_test = train_test_split(df, df.Score, test_size = 0.2, random_state=42)
user_embeddings = X_train.groupby('UserId').babbage_similarity.apply(np.mean)
prod_embeddings = X_train.groupby('ProductId').babbage_similarity.apply(np.mean)
len(user_embeddings), len(prod_embeddings)