3.6 KiB
3.6 KiB
In [1]:
# Dependencies: numerical stack first, then the scikit-learn pieces used for
# TF-IDF vectorisation and pairwise similarity.
import numpy as np
import pandas as pd
import scipy

import sklearn
import sklearn.model_selection
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the two input tables from CSV files in the working directory.
df_eng = pd.read_csv('raw_texts.csv')
df_akk = pd.read_csv('new.csv')

# Disabled experiment: 80/20 train/test split of each table with a fixed seed.
# akk_raw_train, akk_raw_test = sklearn.model_selection.train_test_split(df_akk, test_size=0.2, random_state=0)
# eng_raw_train, eng_raw_test = sklearn.model_selection.train_test_split(df_eng, test_size=0.2, random_state=0)

# Word-level TF-IDF vectoriser; it is created here but not fitted in this cell.
tf_vectorizer = TfidfVectorizer(analyzer='word')
# tf_vectorizer.fit(akk_raw_train['Text'].to_list())
In [2]:
# Fresh word-level TF-IDF vectoriser; rebinding the name discards any earlier
# `tf_vectorizer` instance.
tf_vectorizer = TfidfVectorizer(analyzer='word')

# Fit on every non-null 'Text' entry and keep the resulting document-term matrix.
# NOTE(review): dropna() removes rows, so row i of `save_vect` is the i-th
# non-null text and need not line up with df_akk.iloc[i] — confirm before
# mapping matrix indices back to df_akk.
save_vect = tf_vectorizer.fit_transform(
    df_akk['Text'].dropna().to_list()
)

# Toy input kept for sanity-checking the vectoriser:
# save_vect = tf_vectorizer.fit_transform(['The sun in the sky is bright', 'We can see the shining sun, the bright sun.'])
In [3]:
# Feature names (tokens) reported by the fitted vectoriser; used as column labels.
tfidf_tokens = tf_vectorizer.get_feature_names_out()

# Dense table of the TF-IDF matrix: one row per fitted text, one column per token.
# NOTE(review): toarray() materialises the full matrix in memory.
df_tfidfvect = pd.DataFrame(
    columns=tfidf_tokens,
    data=save_vect.toarray(),
)
In [4]:
# Re-encode the same non-null texts with the already-fitted vectoriser.
# NOTE(review): `test_mat` is not used by any visible cell.
test_mat = tf_vectorizer.transform(
    df_akk['Text'].dropna().to_list()
)

# Pairwise cosine similarity of every row of `save_vect` against every row.
cc = cosine_similarity(save_vect, save_vect)

# Boolean matrix: True where a pair's similarity exceeds 0.5.
bool_similarity = np.greater(cc, 0.5)
In [5]:
# Select pairs whose cosine similarity is above 0.5 but below 1.
similar_mask = (cc > 0.5) & (cc < 1)

# A row compared with itself should score 1, but floating-point rounding can
# leave a diagonal entry slightly below 1, which `cc < 1` alone would let
# through. Clear the diagonal explicitly so self-pairs are never reported.
np.fill_diagonal(similar_mask, False)

# `abcd` is the (row_indices, col_indices) tuple of the selected pairs.
abcd = np.where(similar_mask)

# Write only the row indices to data.csv as one comma-separated run of integers.
abcd[0].tofile("data.csv", sep=",", format="%d")
In [6]:
%matplotlib import matplotlib.pyplot as plt import seaborn as sns
Using matplotlib backend: <object object at 0x00000212CB626CA0>
In [ ]:
# Show the boolean similarity matrix as an image: one cell per pair of rows,
# dark where the pair is flagged as similar. A scatterplot treats a 2-D array
# as wide-form data (each column becomes its own series plotted against the
# row index), which does not depict the matrix itself.
fig, ax = plt.subplots(figsize=(8, 8))
f = sns.heatmap(
    bool_similarity.astype(int),  # cast so the colour mapping gets numeric data
    cmap="Greys",
    cbar=False,
    square=True,
    xticklabels=False,
    yticklabels=False,
    ax=ax,
)
f.set(
    title="Document pairs with cosine similarity > 0.5",
    xlabel="document index",
    ylabel="document index",
);
In [15]:
# Inspect the record at positional index 4 (all columns).
# NOTE(review): this cell's execution count (15) is out of sequence with the
# cells above it; re-run the notebook top to bottom before relying on the output.
df_akk.iloc[4]
Out[15]:
Project                                              P394767
Text       x x x BAD₃-ku-ri-gal-zi x E₂ 44 ša₂ BAD₃-{d}su...
Genre                                                lexical
Name: 4, dtype: object
In [ ]: