{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Scrape project texts into `raw_texts`\n",
    "\n",
    "Scrapes the raw English texts of every project listed in `project_list` and inserts them into the `raw_texts` table. When an insert fails, the scraped rows are dumped to a JSON file named after the project so that they can be re-imported by a later cell."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imported modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import scrapping\n",
    "import psycopg2\n",
    "from psycopg2.extras import execute_batch\n",
    "import re\n",
    "import logging\n",
    "import datetime\n",
    "import multiprocessing\n",
    "import os\n",
    "logging.basicConfig(\n",
    "    level=logging.INFO, filename=f'{datetime.datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")}.log', filemode='w',\n",
    "    format='%(name)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')\n",
    "# from psycopg2.extensions import register_adapter\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Database helpers\n",
    "\n",
    "Connection settings are read from the `DH_DB_NAME`, `DH_DB_USER`, `DH_DB_HOST` and `DH_DB_PASSWORD` environment variables so that no password is stored in the notebook."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "INSERT_RAW_TEXT_SQL = (\n",
    "    'insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) '\n",
    "    'ON CONFLICT DO NOTHING'\n",
    ")\n",
    "PROJECT_LIST_PATH = 'project_list'\n",
    "\n",
    "\n",
    "def connect_db():\n",
    "    \"\"\"Open a psycopg2 connection to the project database.\n",
    "\n",
    "    Settings are read from the DH_DB_NAME, DH_DB_USER, DH_DB_HOST and\n",
    "    DH_DB_PASSWORD environment variables so that no password is stored in\n",
    "    the notebook. When DH_DB_PASSWORD is unset no password is passed and\n",
    "    libpq falls back to its own sources (PGPASSWORD, ~/.pgpass).\n",
    "    \"\"\"\n",
    "    params = {\n",
    "        'dbname': os.environ.get('DH_DB_NAME', 'dh'),\n",
    "        'user': os.environ.get('DH_DB_USER', 'dh'),\n",
    "        'host': os.environ.get('DH_DB_HOST', 'dh.saret.tk'),\n",
    "    }\n",
    "    password = os.environ.get('DH_DB_PASSWORD')\n",
    "    if password is not None:\n",
    "        params['password'] = password\n",
    "    return psycopg2.connect(**params)\n",
    "\n",
    "\n",
    "def read_project_names(path=PROJECT_LIST_PATH, encoding=None):\n",
    "    \"\"\"Return the non-blank, stripped lines of the project list file.\"\"\"\n",
    "    with open(path, encoding=encoding) as project_file:\n",
    "        lines = project_file.read().split('\\n')\n",
    "    # A trailing newline would otherwise yield an empty project name.\n",
    "    return [line.strip() for line in lines if line.strip()]\n",
    "\n",
    "\n",
    "def insert_raw_texts(conn, rows):\n",
    "    \"\"\"Batch-insert `rows` into raw_texts and commit.\n",
    "\n",
    "    On any failure the transaction is rolled back, so the connection stays\n",
    "    usable for the next project, and the exception is re-raised.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        with conn.cursor() as cursor:\n",
    "            execute_batch(cursor, INSERT_RAW_TEXT_SQL, rows)\n",
    "        conn.commit()\n",
    "    except Exception:\n",
    "        conn.rollback()\n",
    "        raise\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape every project and insert it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn = connect_db()\n",
    "try:\n",
    "    for project in read_project_names():\n",
    "        try:\n",
    "            scrap = scrapping.get_raw_english_texts_of_project(project)\n",
    "        except Exception as exc:\n",
    "            # Skip the insert: `scrap` would be undefined or still hold the\n",
    "            # previous project's rows.\n",
    "            logging.exception('Error in %s:%s', project, exc)\n",
    "            continue\n",
    "        try:\n",
    "            insert_raw_texts(conn, scrap)\n",
    "        except Exception as exc:\n",
    "            logging.exception('Error in %s:%s', project, exc)\n",
    "            # Keep the scraped rows on disk so they can be re-imported later.\n",
    "            with open(project, 'w') as dump_file:\n",
    "                dump_file.write(json.dumps(scrap))\n",
    "finally:\n",
    "    conn.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Re-import the JSON dumps of failed inserts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# force=True replaces the handler installed by the imports cell; without it\n",
    "# this second basicConfig call is silently ignored.\n",
    "logging.basicConfig(\n",
    "    level=logging.INFO, filename='./now.log',\n",
    "    format='%(name)s - %(levelname)s - %(message)s', force=True)\n",
    "logging.info('Start')\n",
    "conn = connect_db()\n",
    "try:\n",
    "    # NOTE(review): project_list is decoded as 'unicode-escape' here but with\n",
    "    # the default encoding in the scraping cell - confirm which is intended.\n",
    "    for project in read_project_names(encoding='unicode-escape'):\n",
    "        if not os.path.isfile(project):\n",
    "            continue\n",
    "        logging.info(f'reading {project}')\n",
    "        with open(project) as dump_file:\n",
    "            scrap = json.load(dump_file)\n",
    "        insert_raw_texts(conn, scrap)\n",
    "finally:\n",
    "    conn.close()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Scrape and insert a single project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn = connect_db()\n",
    "try:\n",
    "    scrap = scrapping.get_raw_english_texts_of_project('saao')\n",
    "    insert_raw_texts(conn, scrap)\n",
    "finally:\n",
    "    conn.close()\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}