diff --git a/2023-05-15_21-45-30.log b/2023-05-15_21-45-30.log new file mode 100644 index 0000000..e69de29 diff --git a/__pycache__/scrapping.cpython-39.pyc b/__pycache__/scrapping.cpython-39.pyc index 140d38a..7bb34e9 100644 Binary files a/__pycache__/scrapping.cpython-39.pyc and b/__pycache__/scrapping.cpython-39.pyc differ diff --git a/datat.ipynb b/datat.ipynb new file mode 100644 index 0000000..9d37ac0 --- /dev/null +++ b/datat.ipynb @@ -0,0 +1,109 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "import sklearn.model_selection\n", + "from sklearn.metrics.pairwise import cosine_similarity\n", + "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", + "import pandas as pd\n", + "import scipy\n", + "\n", + "df_eng = pd.read_csv('raw_texts.csv')\n", + "df_akk = pd.read_csv('new.csv')\n", + "# akk_raw_train, akk_raw_test = sklearn.model_selection.train_test_split(df_akk, test_size=0.2, random_state=0)\n", + "# eng_raw_train, eng_raw_test = sklearn.model_selection.train_test_split(df_eng, test_size=0.2, random_state=0)\n", + "tf_vectorizer = TfidfVectorizer(analyzer='word')\n", + "# tf_vectorizer.fit(akk_raw_train['Text'].to_list())" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "tf_vectorizer = TfidfVectorizer(analyzer='word')\n", + "save_vect = tf_vectorizer.fit_transform(df_akk['Text'].dropna().to_list())\n", + "# save_vect = tf_vectorizer.fit_transform(['The sun in the sky is bright', 'We can see the shining sun, the bright sun.'])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "tfidf_tokens = tf_vectorizer.get_feature_names_out()\n", + "df_tfidfvect = pd.DataFrame(data=save_vect.toarray(), columns=tfidf_tokens)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "test_mat = tf_vectorizer.transform(df_akk['Text'].dropna().to_list())\n", + "cc = cosine_similarity(save_vect,save_vect)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "Cannot set a DataFrame with multiple columns to the single column Genre", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[1;32mIn[22], line 2\u001b[0m\n\u001b[0;32m 1\u001b[0m df_genre \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mread_csv(\u001b[39m'\u001b[39m\u001b[39mgenre.csv\u001b[39m\u001b[39m'\u001b[39m)\n\u001b[1;32m----> 2\u001b[0m df_akk[\u001b[39m\"\u001b[39;49m\u001b[39mGenre\u001b[39;49m\u001b[39m\"\u001b[39;49m] \u001b[39m=\u001b[39m pd\u001b[39m.\u001b[39mconcat([df_genre, df_akk], axis\u001b[39m=\u001b[39m\u001b[39m1\u001b[39m, join\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39minner\u001b[39m\u001b[39m'\u001b[39m, keys\u001b[39m=\u001b[39m\u001b[39m'\u001b[39m\u001b[39mProject\u001b[39m\u001b[39m'\u001b[39m)\n", + "File \u001b[1;32mc:\\Users\\Saret\\Programming\\C#\\DH\\venv\\lib\\site-packages\\pandas\\core\\frame.py:3949\u001b[0m, in \u001b[0;36mDataFrame.__setitem__\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m 3947\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_setitem_array(key, value)\n\u001b[0;32m 3948\u001b[0m \u001b[39melif\u001b[39;00m \u001b[39misinstance\u001b[39m(value, DataFrame):\n\u001b[1;32m-> 3949\u001b[0m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_set_item_frame_value(key, value)\n\u001b[0;32m 3950\u001b[0m \u001b[39melif\u001b[39;00m (\n\u001b[0;32m 3951\u001b[0m is_list_like(value)\n\u001b[0;32m 3952\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcolumns\u001b[39m.\u001b[39mis_unique\n\u001b[0;32m 3953\u001b[0m \u001b[39mand\u001b[39;00m \u001b[39m1\u001b[39m \u001b[39m<\u001b[39m \u001b[39mlen\u001b[39m(\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mcolumns\u001b[39m.\u001b[39mget_indexer_for([key])) \u001b[39m==\u001b[39m \u001b[39mlen\u001b[39m(value)\n\u001b[0;32m 3954\u001b[0m ):\n\u001b[0;32m 3955\u001b[0m \u001b[39m# Column to set is duplicated\u001b[39;00m\n\u001b[0;32m 3956\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_setitem_array([key], value)\n", + "File \u001b[1;32mc:\\Users\\Saret\\Programming\\C#\\DH\\venv\\lib\\site-packages\\pandas\\core\\frame.py:4103\u001b[0m, in \u001b[0;36mDataFrame._set_item_frame_value\u001b[1;34m(self, key, value)\u001b[0m\n\u001b[0;32m 4100\u001b[0m \u001b[39mreturn\u001b[39;00m\n\u001b[0;32m 4102\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mlen\u001b[39m(value\u001b[39m.\u001b[39mcolumns) \u001b[39m!=\u001b[39m \u001b[39m1\u001b[39m:\n\u001b[1;32m-> 4103\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[0;32m 4104\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCannot set a DataFrame with multiple columns to the single \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m 4105\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mcolumn \u001b[39m\u001b[39m{\u001b[39;00mkey\u001b[39m}\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m 4106\u001b[0m )\n\u001b[0;32m 4108\u001b[0m \u001b[39mself\u001b[39m[key] \u001b[39m=\u001b[39m value[value\u001b[39m.\u001b[39mcolumns[\u001b[39m0\u001b[39m]]\n", + "\u001b[1;31mValueError\u001b[0m: Cannot set a DataFrame with multiple columns to the single column Genre" + ] + } + ], + "source": [ + "df_genre = pd.read_csv('genre.csv')\n", + "df_akk[\"Genre\"] = pd.concat([df_genre, df_akk], axis=1, join='inner', keys='Project')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/project_notebook.ipynb b/project_notebook.ipynb index 05b2dd2..8facda4 100644 --- a/project_notebook.ipynb +++ b/project_notebook.ipynb @@ -1,15 +1,5 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "```\n", - "installations\n", - "\n", - "```" - ] - }, { "cell_type": "code", "execution_count": 2, @@ -763,7 +753,7 @@ "metadata": {}, "outputs": [], "source": [ - "def " + "#def " ] }, { @@ -86883,7 +86873,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.2" + "version": "3.9.13" } }, "nbformat": 4,