Files
DH/datat.ipynb
2023-06-26 23:12:28 +03:00

6.0 KiB

In [13]:
import sklearn
import sklearn.model_selection
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import pandas as pd
import scipy

df_eng = pd.read_csv('raw_texts.csv')
df_akk = pd.read_csv('new.csv')
# akk_raw_train, akk_raw_test = sklearn.model_selection.train_test_split(df_akk, test_size=0.2, random_state=0)
# eng_raw_train, eng_raw_test = sklearn.model_selection.train_test_split(df_eng, test_size=0.2, random_state=0)
tf_vectorizer = TfidfVectorizer(analyzer='word')
# tf_vectorizer.fit(akk_raw_train['Text'].to_list())
In [14]:
tf_vectorizer = TfidfVectorizer(analyzer='word')
save_vect = tf_vectorizer.fit_transform(df_akk['Text'].dropna().to_list())
# save_vect = tf_vectorizer.fit_transform(['The sun in the sky is bright', 'We can see the shining sun, the bright sun.'])
In [15]:
tfidf_tokens = tf_vectorizer.get_feature_names_out()
df_tfidfvect = pd.DataFrame(data=save_vect.toarray(), columns=tfidf_tokens)
In [16]:
test_mat = tf_vectorizer.transform(df_akk['Text'].dropna().to_list())
cc = cosine_similarity(save_vect,save_vect)
In [22]:
df_genre = pd.read_csv('genre.csv')
df_akk["Genre"] = pd.concat([df_genre, df_akk], axis=1, join='inner', keys='Project')
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[22], line 2
      1 df_genre = pd.read_csv('genre.csv')
----> 2 df_akk["Genre"] = pd.concat([df_genre, df_akk], axis=1, join='inner', keys='Project')

File c:\Users\Saret\Programming\C#\DH\venv\lib\site-packages\pandas\core\frame.py:3949, in DataFrame.__setitem__(self, key, value)
   3947     self._setitem_array(key, value)
   3948 elif isinstance(value, DataFrame):
-> 3949     self._set_item_frame_value(key, value)
   3950 elif (
   3951     is_list_like(value)
   3952     and not self.columns.is_unique
   3953     and 1 < len(self.columns.get_indexer_for([key])) == len(value)
   3954 ):
   3955     # Column to set is duplicated
   3956     self._setitem_array([key], value)

File c:\Users\Saret\Programming\C#\DH\venv\lib\site-packages\pandas\core\frame.py:4103, in DataFrame._set_item_frame_value(self, key, value)
   4100     return
   4102 if len(value.columns) != 1:
-> 4103     raise ValueError(
   4104         "Cannot set a DataFrame with multiple columns to the single "
   4105         f"column {key}"
   4106     )
   4108 self[key] = value[value.columns[0]]

ValueError: Cannot set a DataFrame with multiple columns to the single column Genre
In [ ]: