{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Akkadian text similarity (TF-IDF + cosine)\n",
    "\n",
    "Fits a word-level TF-IDF model on the `Text` column of `new.csv`, computes the pairwise cosine similarity between all texts, and records which texts are more similar than `SIMILARITY_THRESHOLD`.\n",
    "\n",
    "**Files written**\n",
    "\n",
    "- `data.csv`: flat, comma-separated list of row positions that take part in at least one match (same format as earlier versions of this notebook).\n",
    "- `similar_pairs.csv`: one line per matching pair, with both row positions, both `Project` ids and the similarity score.\n",
    "\n",
    "**Important:** every row position in this notebook refers to `df_akk_clean` (rows with a missing `Text` removed, index renumbered), *not* to the raw `df_akk`.\n",
    "\n",
    "**Open points**\n",
    "\n",
    "- TODO: `raw_texts.csv` (`df_eng`) is loaded but not used yet.\n",
    "- TODO: an 80/20 train/test split (`test_size=0.2, random_state=0`) was sketched in an earlier draft and is not applied here.\n",
    "- TODO confirm: `TfidfVectorizer` defaults lowercase the input and keep only tokens of two or more word characters, which may not be what is wanted for sign-by-sign transliterations."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# All imports live in this one cell so a fresh kernel can run top to bottom.\n",
    "%matplotlib inline\n",
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import scipy\n",
    "import seaborn as sns\n",
    "import sklearn\n",
    "import sklearn.model_selection\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from sklearn.metrics.pairwise import cosine_similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Configuration: everything a reader might want to tune.\n",
    "AKK_CSV = 'new.csv'\n",
    "ENG_CSV = 'raw_texts.csv'\n",
    "ROW_INDEX_CSV = 'data.csv'\n",
    "PAIRS_CSV = 'similar_pairs.csv'\n",
    "\n",
    "# Two texts count as similar when their cosine similarity is above this value.\n",
    "SIMILARITY_THRESHOLD = 0.5\n",
    "# Scores within this distance of 1.0 are treated as verbatim duplicates and skipped.\n",
    "# The earlier filter was an exact `cc < 1`, which is unreliable for floats.\n",
    "DUPLICATE_TOL = 1e-9"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_eng = pd.read_csv(ENG_CSV)\n",
    "df_akk = pd.read_csv(AKK_CSV)\n",
    "\n",
    "# Drop rows without text once and renumber, so that position i in the frame is\n",
    "# position i in the TF-IDF matrix and in the similarity matrix.\n",
    "df_akk_clean = df_akk.dropna(subset=['Text']).reset_index(drop=True)\n",
    "\n",
    "n_dropped = len(df_akk) - len(df_akk_clean)\n",
    "print(f'{n_dropped} of {len(df_akk)} rows dropped for missing Text')\n",
    "df_akk_clean.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TF-IDF features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf_vectorizer = TfidfVectorizer(analyzer='word')\n",
    "tfidf_matrix = tf_vectorizer.fit_transform(df_akk_clean['Text'].to_list())\n",
    "\n",
    "assert tfidf_matrix.shape[0] == len(df_akk_clean), 'one TF-IDF row per cleaned text expected'\n",
    "tfidf_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tfidf_tokens = tf_vectorizer.get_feature_names_out()\n",
    "\n",
    "# Sparse-backed frame: same values as a dense frame, without materialising an\n",
    "# (n_texts x n_tokens) dense array in memory.\n",
    "df_tfidfvect = pd.DataFrame.sparse.from_spmatrix(tfidf_matrix, columns=tfidf_tokens)\n",
    "df_tfidfvect.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Pairwise cosine similarity"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Dense (n_texts x n_texts) matrix; entry [i, j] compares cleaned rows i and j.\n",
    "cos_sim = cosine_similarity(tfidf_matrix)\n",
    "assert cos_sim.shape == (len(df_akk_clean), len(df_akk_clean))\n",
    "\n",
    "bool_similarity = cos_sim > SIMILARITY_THRESHOLD"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Similar, but neither a text compared with itself nor a verbatim duplicate.\n",
    "# `&` builds a new array, so bool_similarity itself is left untouched.\n",
    "similar_mask = bool_similarity & (cos_sim < 1.0 - DUPLICATE_TOL)\n",
    "np.fill_diagonal(similar_mask, False)\n",
    "rows, cols = np.nonzero(similar_mask)\n",
    "\n",
    "# Backward-compatible output: flat comma-separated row positions, one per match.\n",
    "rows.tofile(ROW_INDEX_CSV, sep=',', format='%d')\n",
    "\n",
    "# The matrix is symmetric, so keep each unordered pair once (row_a < row_b).\n",
    "upper = rows < cols\n",
    "projects = df_akk_clean['Project'].to_numpy()\n",
    "similar_pairs = pd.DataFrame({\n",
    "    'row_a': rows[upper],\n",
    "    'row_b': cols[upper],\n",
    "    'project_a': projects[rows[upper]],\n",
    "    'project_b': projects[cols[upper]],\n",
    "    'similarity': cos_sim[rows[upper], cols[upper]],\n",
    "}).sort_values('similarity', ascending=False, ignore_index=True)\n",
    "\n",
    "similar_pairs.to_csv(PAIRS_CSV, index=False)\n",
    "similar_pairs.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualise the similarity mask"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Draw the boolean matrix as an image: one pixel per (text, text) pair,\n",
    "# dark where the similarity is above the threshold.\n",
    "fig, ax = plt.subplots(figsize=(8, 8))\n",
    "ax.imshow(bool_similarity, cmap='Greys', interpolation='nearest')\n",
    "ax.set(\n",
    "    title=f'Text pairs with cosine similarity > {SIMILARITY_THRESHOLD}',\n",
    "    xlabel='text position (df_akk_clean)',\n",
    "    ylabel='text position (df_akk_clean)',\n",
    ");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Inspect a single text\n",
    "\n",
    "Look rows up in `df_akk_clean`, whose positions match the similarity matrix."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_akk_clean.iloc[4, :]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}