6.0 KiB
6.0 KiB
In [3]:
import sklearn import sklearn.model_selection from sklearn.metrics.pairwise import cosine_similarity from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer import pandas as pd import scipy df_eng = pd.read_csv('raw_texts.csv') df_akk = pd.read_csv('new.csv') # akk_raw_train, akk_raw_test = sklearn.model_selection.train_test_split(df_akk, test_size=0.2, random_state=0) # eng_raw_train, eng_raw_test = sklearn.model_selection.train_test_split(df_eng, test_size=0.2, random_state=0) tf_vectorizer = TfidfVectorizer(analyzer='word') # tf_vectorizer.fit(akk_raw_train['Text'].to_list())
In [4]:
tf_vectorizer = TfidfVectorizer(analyzer='word') save_vect = tf_vectorizer.fit_transform(df_akk['Text'].dropna().to_list()) # save_vect = tf_vectorizer.fit_transform(['The sun in the sky is bright', 'We can see the shining sun, the bright sun.'])
In [5]:
tfidf_tokens = tf_vectorizer.get_feature_names_out() df_tfidfvect = pd.DataFrame(data=save_vect.toarray(), columns=tfidf_tokens)
In [6]:
test_mat = tf_vectorizer.transform(df_akk['Text'].dropna().to_list()) cc = cosine_similarity(save_vect,save_vect) bool_similarity = cc > 0.5
In [22]:
df_genre = pd.read_csv('genre.csv') df_akk["Genre"] = pd.concat([df_genre, df_akk], axis=1, join='inner', keys='Project')
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[22], line 2 1 df_genre = pd.read_csv('genre.csv') ----> 2 df_akk["Genre"] = pd.concat([df_genre, df_akk], axis=1, join='inner', keys='Project') File c:\Users\Saret\Programming\C#\DH\venv\lib\site-packages\pandas\core\frame.py:3949, in DataFrame.__setitem__(self, key, value) 3947 self._setitem_array(key, value) 3948 elif isinstance(value, DataFrame): -> 3949 self._set_item_frame_value(key, value) 3950 elif ( 3951 is_list_like(value) 3952 and not self.columns.is_unique 3953 and 1 < len(self.columns.get_indexer_for([key])) == len(value) 3954 ): 3955 # Column to set is duplicated 3956 self._setitem_array([key], value) File c:\Users\Saret\Programming\C#\DH\venv\lib\site-packages\pandas\core\frame.py:4103, in DataFrame._set_item_frame_value(self, key, value) 4100 return 4102 if len(value.columns) != 1: -> 4103 raise ValueError( 4104 "Cannot set a DataFrame with multiple columns to the single " 4105 f"column {key}" 4106 ) 4108 self[key] = value[value.columns[0]] ValueError: Cannot set a DataFrame with multiple columns to the single column Genre
In [ ]: