Compare commits
2 Commits
26bbbe7d8c
...
03f1d663d0
Author | SHA1 | Date | |
---|---|---|---|
03f1d663d0 | |||
db8244d902 |
0
2023-05-15_21-45-30.log
Normal file
0
2023-05-15_21-45-30.log
Normal file
Binary file not shown.
110
datat.ipynb
Normal file
110
datat.ipynb
Normal file
@@ -0,0 +1,110 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import sklearn\n",
|
||||||
|
"import sklearn.model_selection\n",
|
||||||
|
"from sklearn.metrics.pairwise import cosine_similarity\n",
|
||||||
|
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"import scipy\n",
|
||||||
|
"\n",
|
||||||
|
"df_eng = pd.read_csv('raw_texts.csv')\n",
|
||||||
|
"df_akk = pd.read_csv('new.csv')\n",
|
||||||
|
"# akk_raw_train, akk_raw_test = sklearn.model_selection.train_test_split(df_akk, test_size=0.2, random_state=0)\n",
|
||||||
|
"# eng_raw_train, eng_raw_test = sklearn.model_selection.train_test_split(df_eng, test_size=0.2, random_state=0)\n",
|
||||||
|
"tf_vectorizer = TfidfVectorizer(analyzer='word')\n",
|
||||||
|
"# tf_vectorizer.fit(akk_raw_train['Text'].to_list())"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tf_vectorizer = TfidfVectorizer(analyzer='word')\n",
|
||||||
|
"save_vect = tf_vectorizer.fit_transform(df_akk['Text'].dropna().to_list())\n",
|
||||||
|
"# save_vect = tf_vectorizer.fit_transform(['The sun in the sky is bright', 'We can see the shining sun, the bright sun.'])\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tfidf_tokens = tf_vectorizer.get_feature_names_out()\n",
|
||||||
|
"df_tfidfvect = pd.DataFrame(data=save_vect.toarray(), columns=tfidf_tokens)\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"test_mat = tf_vectorizer.transform(df_akk['Text'].dropna().to_list())\n",
|
||||||
|
"cc = cosine_similarity(save_vect,save_vect)\n",
|
||||||
|
"bool_similarity = cc > 0.5\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"ename": "ValueError",
|
||||||
|
"evalue": "Cannot set a DataFrame with multiple columns to the single column Genre",
|
||||||
|
"output_type": "error",
|
||||||
|
"traceback": [
|
||||||
|
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||||
|
"Cell \u001b[1;32mIn[22], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m df_genre \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mread_csv(\u001b[39m'\u001b[39m\u001b[39mgenre.csv\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m df_akk[\u001b[39m\"\u001b[39;49m\u001b[39mGenre\u001b[39;49m\u001b[39m\"\u001b[39;49m] \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mconcat([df_genre, df_akk], axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m, join\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39minner\u001b[39m\u001b[39m'\u001b[39m, keys\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mProject\u001b[39m\u001b[39m'\u001b[39m)\n",
|
||||||
|
"File \u001b[1;32mc:\\Users\\Saret\\Programming\\C#\\DH\\venv\\lib\\site-packages\\pandas\\core\\frame.py:3949\u001b[0m, in \u001b[0;36mDataFrame.__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m 3947\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_setitem_array(key, value)\n\u001b[0;32m 3948\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(value, DataFrame):\n\u001b[1;32m-> 3949\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_set_item_frame_value(key, value)\n\u001b[0;32m 3950\u001b[0m \u001b[39melif\u001b[39;00m (\n\u001b[0;32m 3951\u001b[0m is_list_like(value)\n\u001b[0;32m 3952\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcolumns\u001b[39m.\u001b[39mis_unique\n\u001b[0;32m 3953\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39m1\u001b[39m \u001b[39m<\u001b[39m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcolumns\u001b[39m.\u001b[39mget_indexer_for([key])) \u001b[39m==\u001b[39m \u001b[39mlen\u001b[39m(value)\n\u001b[0;32m 3954\u001b[0m ):\n\u001b[0;32m 3955\u001b[0m \u001b[39m# Column to set is duplicated\u001b[39;00m\n\u001b[0;32m 3956\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_setitem_array([key], value)\n",
|
||||||
|
"File \u001b[1;32mc:\\Users\\Saret\\Programming\\C#\\DH\\venv\\lib\\site-packages\\pandas\\core\\frame.py:4103\u001b[0m, in \u001b[0;36mDataFrame._set_item_frame_value\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m 4100\u001b[0m \u001b[39mreturn\u001b[39;00m\n\u001b[0;32m 4102\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(value\u001b[39m.\u001b[39mcolumns) \u001b[39m!=\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m-> 4103\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 4104\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCannot set a DataFrame with multiple columns to the single \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 4105\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcolumn \u001b[39m\u001b[39m{\u001b[39;00mkey\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 4106\u001b[0m )\n\u001b[0;32m 4108\u001b[0m \u001b[39mself\u001b[39m[key] \u001b[39m=\u001b[39m value[value\u001b[39m.\u001b[39mcolumns[\u001b[39m0\u001b[39m]]\n",
|
||||||
|
"\u001b[1;31mValueError\u001b[0m: Cannot set a DataFrame with multiple columns to the single column Genre"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"df_genre = pd.read_csv('genre.csv')\n",
|
||||||
|
"df_akk[\"Genre\"] = pd.concat([df_genre, df_akk], axis=1, join='inner', keys='Project')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.13"
|
||||||
|
},
|
||||||
|
"orig_nbformat": 4
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 2
|
||||||
|
}
|
@@ -1,15 +1,5 @@
|
|||||||
{
|
{
|
||||||
"cells": [
|
"cells": [
|
||||||
{
|
|
||||||
"cell_type": "markdown",
|
|
||||||
"metadata": {},
|
|
||||||
"source": [
|
|
||||||
"```\n",
|
|
||||||
"installations\n",
|
|
||||||
"\n",
|
|
||||||
"```"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 2,
|
"execution_count": 2,
|
||||||
@@ -763,7 +753,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"def "
|
"#def "
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@@ -86883,7 +86873,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.2"
|
"version": "3.9.13"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
52
projects.csv
Normal file
52
projects.csv
Normal file
@@ -0,0 +1,52 @@
|
|||||||
|
1,adsd
|
||||||
|
2,aemw
|
||||||
|
3,akklove
|
||||||
|
4,amgg
|
||||||
|
5,ario
|
||||||
|
6,armep
|
||||||
|
7,arrim
|
||||||
|
8,asbp
|
||||||
|
9,atae
|
||||||
|
10,babcity
|
||||||
|
11,blms
|
||||||
|
12,btmao
|
||||||
|
13,btto
|
||||||
|
14,cams
|
||||||
|
15,caspo
|
||||||
|
16,ccpo
|
||||||
|
17,cdli
|
||||||
|
18,ckst
|
||||||
|
19,cmawro
|
||||||
|
20,contrib
|
||||||
|
23,ctij
|
||||||
|
24,dcclt
|
||||||
|
25,dccmt
|
||||||
|
26,doc
|
||||||
|
27,dsst
|
||||||
|
28,ecut
|
||||||
|
29,eisl
|
||||||
|
30,epsd2
|
||||||
|
31,etcsri
|
||||||
|
32,glass
|
||||||
|
33,hbtin
|
||||||
|
34,lacost
|
||||||
|
35,lovelyrics
|
||||||
|
36,neo
|
||||||
|
37,nere
|
||||||
|
38,nimrud
|
||||||
|
39,obel
|
||||||
|
40,obmc
|
||||||
|
41,obta
|
||||||
|
42,ogsl
|
||||||
|
43,oimea
|
||||||
|
44,pnao
|
||||||
|
45,qcat
|
||||||
|
46,riao
|
||||||
|
47,ribo
|
||||||
|
48,rimanum
|
||||||
|
49,rinap
|
||||||
|
50,saao
|
||||||
|
51,suhu
|
||||||
|
52,tcma
|
||||||
|
53,tsae
|
||||||
|
54,xcat
|
|
1
raw_json.csv
Normal file
1
raw_json.csv
Normal file
File diff suppressed because one or more lines are too long
43704
raw_texts.csv
Normal file
43704
raw_texts.csv
Normal file
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user