Files
DH/datat.ipynb
2023-08-12 18:19:37 +03:00

3.6 KiB

In [1]:
import sklearn
import sklearn.model_selection
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import scipy
import numpy as np

# Input files, read from the notebook's working directory.
ENG_CSV, AKK_CSV = 'raw_texts.csv', 'new.csv'

# Load both tables in full. A train/test split is not applied at this stage;
# the complete frames are used as-is.
df_akk = pd.read_csv(AKK_CSV)
df_eng = pd.read_csv(ENG_CSV)

# Word-level TF-IDF vectorizer; it is created here but not fitted yet.
tf_vectorizer = TfidfVectorizer(analyzer='word')
In [2]:
# Rows of df_akk whose 'Text' is missing are skipped, so row i of the resulting
# matrix corresponds to the i-th non-null text, not necessarily to df_akk row i.
akk_corpus = df_akk['Text'].dropna().to_list()

# Fit a fresh word-level TF-IDF model on that corpus and keep the resulting
# sparse document-term matrix.
# NOTE(review): apart from analyzer='word' the vectorizer runs on library
# defaults (case folding, default token pattern) — confirm that suits the
# transliterated sign sequences stored in 'Text'.
tf_vectorizer = TfidfVectorizer(analyzer='word')
save_vect = tf_vectorizer.fit_transform(akk_corpus)
In [3]:
# Column labels: the vocabulary terms learned by the fitted vectorizer.
tfidf_tokens = tf_vectorizer.get_feature_names_out()

# Densify the TF-IDF matrix and wrap it in a DataFrame with one column per term.
# The dense copy holds every (document, term) cell explicitly, so it can be
# large for a big vocabulary.
dense_weights = save_vect.toarray()
df_tfidfvect = pd.DataFrame(dense_weights, columns=tfidf_tokens)
In [4]:
# Re-vectorize the same non-null texts with the already-fitted vectorizer.
# NOTE(review): test_mat is not read by any other cell shown in this notebook.
texts_for_transform = df_akk['Text'].dropna().to_list()
test_mat = tf_vectorizer.transform(texts_for_transform)

# Pairwise cosine similarity of every document against every other document;
# with a single argument the matrix is compared against itself.
cc = cosine_similarity(save_vect)

# Boolean mask of document pairs whose similarity exceeds 0.5.
bool_similarity = np.greater(cc, 0.5)
In [5]:
# Collect index pairs whose cosine similarity is above 0.5 but not (numerically) 1.
# A plain `cc < 1` test is unreliable for this: floating-point rounding can leave
# a document's similarity with itself at 0.9999999999999998, which would then be
# reported as a match. Values within 1e-9 of 1 are therefore excluded explicitly.
is_unit_similarity = np.isclose(cc, 1.0, rtol=0.0, atol=1e-9)
abcd = np.where((cc > 0.5) & ~is_unit_similarity)

# Only the row indices (abcd[0]) are written, comma-separated on a single line;
# the matching column indices in abcd[1] are not saved.
# NOTE(review): these indices are rows of `cc`; if `cc` was built from the
# NaN-dropped 'Text' values they match df_akk row positions only when no 'Text'
# value is missing — confirm before looking them up with df_akk.iloc.
abcd[0].tofile("data.csv", sep = ",", format = "%d")
In [6]:
# Activate matplotlib integration for this kernel. No backend name is given,
# so the backend that ends up selected is whatever the cell output reports.
# (Keep comments off the magic line itself: a line magic takes the rest of
# its line as its argument.)
%matplotlib
# Plotting libraries used by the visualisation cell below.
import matplotlib.pyplot as plt
import seaborn as sns
Using matplotlib backend: <object object at 0x00000212CB626CA0>
In [ ]:
# Plot one point per document pair flagged in bool_similarity.
# Passing the whole N x N boolean matrix positionally does not do this: depending
# on the seaborn version it is taken either as wide-form `data` (one series per
# column) or as the `x` vector, so the pair coordinates are extracted first and
# handed over explicitly as x / y.
pair_rows, pair_cols = np.nonzero(bool_similarity)
f = sns.scatterplot(x=pair_cols, y=pair_rows, s=5)
f.set(
    xlabel="document index (column)",
    ylabel="document index (row)",
    title="Document pairs flagged in bool_similarity",
);
In [15]:
# Show every column of the record at integer position 4 of df_akk.
df_akk.iloc[4]
Out[15]:
Project                                              P394767
Text       x x x BAD₃-ku-ri-gal-zi x E₂ 44 ša₂ BAD₃-{d}su...
Genre                                                lexical
Name: 4, dtype: object
In [ ]: