{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load raw English project texts into `raw_texts`\n",
    "\n",
    "For every project named in the `project_list` file, this notebook calls `scrapping.get_raw_english_texts_of_project` and inserts the returned records into the `raw_texts` table of the `dh` PostgreSQL database.\n",
    "\n",
    "The database password is not stored in the notebook: it is read from the `DH_DB_PASSWORD` environment variable, and you are prompted for it if the variable is not set."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imported modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import contextlib\n",
    "import datetime\n",
    "import getpass\n",
    "import json\n",
    "import logging\n",
    "import os\n",
    "import re\n",
    "\n",
    "import psycopg2\n",
    "from psycopg2.extras import execute_batch\n",
    "\n",
    "import scrapping"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connection settings for the target database. The password comes from the\n",
    "# environment (or an interactive prompt) so it never ends up in the notebook.\n",
    "DB_NAME = 'dh'\n",
    "DB_USER = 'dh'\n",
    "DB_HOST = 'dh.saret.tk'\n",
    "DB_PASSWORD = os.environ.get('DH_DB_PASSWORD') or getpass.getpass('Password for the dh database: ')\n",
    "\n",
    "PROJECT_LIST_PATH = 'project_list'\n",
    "# Position in the project list used by the single-project cell at the bottom.\n",
    "SINGLE_PROJECT_INDEX = 2\n",
    "INSERT_RAW_TEXT_SQL = \"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\"\n",
    "LOG_FORMAT = '%(name)s - %(levelname)s - %(message)s'\n",
    "\n",
    "# Log to a file named after the time this cell first runs.\n",
    "# NOTE: datefmt only takes effect once %(asctime)s is added to LOG_FORMAT.\n",
    "logging.basicConfig(\n",
    "    level=logging.INFO,\n",
    "    filename=f'{datetime.datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")}.log',\n",
    "    filemode='w',\n",
    "    format=LOG_FORMAT,\n",
    "    datefmt='%d-%b-%y %H:%M:%S',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def connect_to_db():\n",
    "    \"\"\"Open a new psycopg2 connection to the configured database.\n",
    "\n",
    "    Returns:\n",
    "        A psycopg2 connection. The caller is responsible for closing it.\n",
    "    \"\"\"\n",
    "    return psycopg2.connect(dbname=DB_NAME, user=DB_USER, host=DB_HOST, password=DB_PASSWORD)\n",
    "\n",
    "\n",
    "def read_project_names(path=PROJECT_LIST_PATH):\n",
    "    \"\"\"Return the project names listed in `path`, one per line.\n",
    "\n",
    "    Surrounding whitespace is stripped and blank lines (including the empty\n",
    "    entry a trailing newline would produce) are skipped.\n",
    "    \"\"\"\n",
    "    with open(path) as project_file:\n",
    "        stripped = (line.strip() for line in project_file)\n",
    "        return [name for name in stripped if name]\n",
    "\n",
    "\n",
    "def insert_raw_texts(conn, scrap):\n",
    "    \"\"\"Insert scraped records into `raw_texts` and commit.\n",
    "\n",
    "    Args:\n",
    "        conn: open psycopg2 connection.\n",
    "        scrap: sequence of mappings providing the `id_text`, `project_name`\n",
    "            and `raw_text` keys named in INSERT_RAW_TEXT_SQL.\n",
    "\n",
    "    Raises:\n",
    "        Exception: whatever the insert or commit raised. The transaction is\n",
    "            rolled back first; without that, psycopg2 rejects every later\n",
    "            statement on this connection.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        with conn.cursor() as cursor:\n",
    "            execute_batch(cursor, INSERT_RAW_TEXT_SQL, scrap)\n",
    "        conn.commit()\n",
    "    except Exception:\n",
    "        conn.rollback()\n",
    "        raise"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load every project\n",
    "\n",
    "Each project is scraped and inserted in its own transaction. When an insert fails, the error is logged with its traceback, the scraped records are dumped as JSON to a file named after the project, and the loop moves on to the next project. Scraping errors are not caught and stop the run."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "with contextlib.closing(connect_to_db()) as conn:\n",
    "    for project in read_project_names():\n",
    "        scrap = scrapping.get_raw_english_texts_of_project(project)\n",
    "        try:\n",
    "            insert_raw_texts(conn, scrap)\n",
    "        except Exception:\n",
    "            # Log first so the real error is recorded even if the dump below fails.\n",
    "            logging.exception('Error in %s', project)\n",
    "            # Keep the scraped records on disk so they are not lost with the failed insert.\n",
    "            with open(project, 'w') as dump_file:\n",
    "                dump_file.write(json.dumps(scrap))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load a single project\n",
    "\n",
    "Scrapes and inserts only the project at position `SINGLE_PROJECT_INDEX` of the project list, logging to `./now.log`. Errors are not caught here. This targets the same table as the full load above, so it is presumably meant to be run instead of that cell rather than after it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# force=True is required: the root logger was already configured above, and\n",
    "# without it this call would be silently ignored.\n",
    "logging.basicConfig(level=logging.INFO, filename='./now.log', format=LOG_FORMAT, force=True)\n",
    "logging.info('Start')\n",
    "\n",
    "with contextlib.closing(connect_to_db()) as conn:\n",
    "    project = read_project_names()[SINGLE_PROJECT_INDEX]\n",
    "    scrap = scrapping.get_raw_english_texts_of_project(project)\n",
    "    insert_raw_texts(conn, scrap)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}