Compare commits

..

2 Commits

Author SHA1 Message Date
03f1d663d0 boolean similarity 2023-06-26 23:21:34 +03:00
db8244d902 update 2023-06-26 23:12:28 +03:00
9 changed files with 82358 additions and 12 deletions

0
2023-05-15_21-45-30.log Normal file
View File

Binary file not shown.

110
datat.ipynb Normal file
View File

@@ -0,0 +1,110 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import sklearn\n",
"import sklearn.model_selection\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"import pandas as pd\n",
"import scipy\n",
"\n",
"# Load the two text tables from CSV files that sit next to the notebook.\n",
"# NOTE(review): presumably raw_texts.csv holds the English texts and new.csv the Akkadian ones -- confirm.\n",
"df_eng = pd.read_csv('raw_texts.csv')\n",
"df_akk = pd.read_csv('new.csv')\n",
"# TODO: a train/test split of both frames (test_size=0.2, random_state=0) was sketched here\n",
"# but never enabled. The TF-IDF vectorizer is created in the next cell, right where it is fitted."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Word-level TF-IDF model, fitted on every non-missing entry of df_akk['Text'].\n",
"# save_vect is the resulting document-term matrix (one row per kept text).\n",
"tf_vectorizer = TfidfVectorizer(analyzer='word')\n",
"save_vect = tf_vectorizer.fit_transform(\n",
"    df_akk['Text'].dropna().to_list()\n",
")\n",
"# Quick sanity check alternative: fit on two toy sentences instead, e.g.\n",
"# ['The sun in the sky is bright', 'We can see the shining sun, the bright sun.']\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term.\n",
"tfidf_tokens = tf_vectorizer.get_feature_names_out()\n",
"df_tfidfvect = pd.DataFrame(save_vect.toarray(), columns=tfidf_tokens)\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Cut-off above which two texts are flagged as similar.\n",
"SIMILARITY_THRESHOLD = 0.5\n",
"\n",
"# Pairwise cosine similarity between all TF-IDF rows of save_vect (square matrix).\n",
"# The former test_mat re-transformed the very corpus the vectorizer was fitted on and was\n",
"# never read afterwards, so that redundant pass has been dropped.\n",
"cc = cosine_similarity(save_vect, save_vect)\n",
"# Boolean mask of similar pairs; the diagonal compares each text with itself.\n",
"bool_similarity = cc > SIMILARITY_THRESHOLD\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df_genre = pd.read_csv('genre.csv')\n",
"# pd.concat(..., axis=1) returns a multi-column DataFrame, which pandas refuses to assign to\n",
"# the single column 'Genre' (ValueError). Look the genre up row by row instead.\n",
"# NOTE(review): assumes df_akk and genre.csv both have a 'Project' column, that genre.csv has\n",
"# a 'Genre' column, and that 'Project' is unique in genre.csv -- confirm against the data.\n",
"genre_by_project = df_genre.set_index('Project')['Genre']\n",
"# Rows whose project is missing from genre.csv get NaN instead of being dropped.\n",
"df_akk['Genre'] = df_akk['Project'].map(genre_by_project)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

25491
genre.csv Normal file

File diff suppressed because it is too large Load Diff

12998
new.csv Normal file

File diff suppressed because one or more lines are too long

View File

@@ -1,15 +1,5 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"installations\n",
"\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 2,
@@ -763,7 +753,7 @@
"metadata": {},
"outputs": [],
"source": [
"def "
"#def "
]
},
{
@@ -86883,7 +86873,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.2"
"version": "3.9.13"
}
},
"nbformat": 4,

52
projects.csv Normal file
View File

@@ -0,0 +1,52 @@
1,adsd
2,aemw
3,akklove
4,amgg
5,ario
6,armep
7,arrim
8,asbp
9,atae
10,babcity
11,blms
12,btmao
13,btto
14,cams
15,caspo
16,ccpo
17,cdli
18,ckst
19,cmawro
20,contrib
23,ctij
24,dcclt
25,dccmt
26,doc
27,dsst
28,ecut
29,eisl
30,epsd2
31,etcsri
32,glass
33,hbtin
34,lacost
35,lovelyrics
36,neo
37,nere
38,nimrud
39,obel
40,obmc
41,obta
42,ogsl
43,oimea
44,pnao
45,qcat
46,riao
47,ribo
48,rimanum
49,rinap
50,saao
51,suhu
52,tcma
53,tsae
54,xcat
1 1 adsd
2 2 aemw
3 3 akklove
4 4 amgg
5 5 ario
6 6 armep
7 7 arrim
8 8 asbp
9 9 atae
10 10 babcity
11 11 blms
12 12 btmao
13 13 btto
14 14 cams
15 15 caspo
16 16 ccpo
17 17 cdli
18 18 ckst
19 19 cmawro
20 20 contrib
21 23 ctij
22 24 dcclt
23 25 dccmt
24 26 doc
25 27 dsst
26 28 ecut
27 29 eisl
28 30 epsd2
29 31 etcsri
30 32 glass
31 33 hbtin
32 34 lacost
33 35 lovelyrics
34 36 neo
35 37 nere
36 38 nimrud
37 39 obel
38 40 obmc
39 41 obta
40 42 ogsl
41 43 oimea
42 44 pnao
43 45 qcat
44 46 riao
45 47 ribo
46 48 rimanum
47 49 rinap
48 50 saao
49 51 suhu
50 52 tcma
51 53 tsae
52 54 xcat

1
raw_json.csv Normal file

File diff suppressed because one or more lines are too long

43704
raw_texts.csv Normal file

File diff suppressed because one or more lines are too long