update the progress

This commit is contained in:
2023-04-17 02:51:50 +03:00
parent a9e93bd99f
commit 89ec3a3578
4 changed files with 4831 additions and 57 deletions

1
jsons_unzipped Symbolic link
View File

@@ -0,0 +1 @@
C:/Users/Saret/WaitForIt/oracc/

4114
missing_list.txt Normal file

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +1,24 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"```\n",
"installations\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# !pip install sshtunnel"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -22,14 +41,32 @@
"import glob\n",
"import threading\n",
"import datetime\n",
"from sshtunnel import SSHTunnelForwarder\n",
"import multiprocessing\n",
"import os\n",
"from typing import Tuple\n",
"logging.basicConfig(\n",
" level=logging.INFO, filename=f'{datetime.datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")}.log', filemode='w',\n",
" level=logging.INFO, filename=f'logs/{datetime.datetime.now().strftime(\"%Y-%m-%d_%H-%M-%S\")}.log', filemode='w',\n",
" format='%(name)s - %(levelname)s - %(asctime)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')\n",
"# from psycopg2.extensions import register_adapter\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Web Scraping for English translations"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Functions"
]
},
{
"cell_type": "code",
"execution_count": 2,
@@ -38,19 +75,22 @@
"source": [
"def send_data_to_db(data: list):\n",
" try:\n",
" conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
" conn = psycopg2.connect(dbname='dh', user='dh', password='qwerty',\n",
" host='dh.saret.tk', port='5432', sslmode='require')\n",
" connection = conn.cursor()\n",
" execute_batch(\n",
" connection, \"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING\",\n",
" data)\n",
" connection,\n",
" \"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING\", data)\n",
" conn.commit()\n",
" except Exception as e:\n",
" logging.error(f\"Error in {data}:{e}\")\n",
"\n",
"\n",
"def run_file():\n",
"    \"\"\"Placeholder with no behaviour yet; calling it returns None.\"\"\"\n",
"\n",
"def get_project_files(project_name: str)-> list:\n",
"\n",
"def get_project_files(project_name: str) -> list:\n",
"    \"\"\"Return every corpusjson/*.json path found under jsons_unzipped/<project_name>.\n",
"\n",
"    With recursive=True the `**` segment matches zero or more directories, so this\n",
"    single pattern covers both <project>/corpusjson/ and <project>/<sub>/corpusjson/.\n",
"    A second glob for the top-level directory would only list those files again.\n",
"\n",
"    Args:\n",
"        project_name: Directory name of the project below jsons_unzipped/.\n",
"\n",
"    Returns:\n",
"        A list of matching file paths; empty when nothing matches.\n",
"    \"\"\"\n",
"    return glob.glob(f'jsons_unzipped/{project_name}/**/corpusjson/*.json', recursive=True)\n",
"\n",
@@ -66,11 +106,12 @@
" logging.error(f\"Error in {filepath}\")\n",
" return None\n",
" url = f\"http://{oracc_site}/{project_name}/{cur_json['textid']}/html\"\n",
" send_data_to_db( [{\"id_text\": cur_json['textid'],\n",
" \"project_name\": project_name,\n",
" \"raw_text\": scrapping._clean_raw_text(scrapping._download_data_from_website(url))}])\n",
" send_data_to_db([{\"id_text\": cur_json['textid'],\n",
" \"project_name\": project_name,\n",
" \"raw_text\": scrapping._clean_raw_text(scrapping._download_data_from_website(url))}])\n",
"\n",
"def run_this_shit(project: str)->None:\n",
"\n",
"def run_this_shit(project: str) -> None:\n",
" for file in get_project_files(project):\n",
" theadme = threading.Thread(None, download_text, args=(file,))\n",
" try:\n",
@@ -80,22 +121,670 @@
"\n",
"\n",
"def run_this_other_shit() -> None:\n",
" with open('missing_list.txt') as list:\n",
" for file in list.readlines():\n",
" theadme = threading.Thread(None, download_text, args=(file[:-1],))\n",
" conn = psycopg2.connect(dbname='dh', user='dh', password='qwerty',\n",
" host='dh.saret.tk', port='5432', sslmode='require')\n",
" connection = conn.cursor()\n",
" connection.execute('select distinct(project_name) from raw_texts')\n",
" # project_lists = [p[0] for p in connection.fetchall()]\n",
" # connection.close()\n",
" for project in connection.fetchall():\n",
" for file in get_project_files(project[0]):\n",
" theadme = threading.Thread(None, download_text, args=(file,))\n",
" try:\n",
" theadme.start()\n",
" except Exception as e:\n",
" logging.error(e)\n"
" logging.error(f'{e}: {e.args}')\n",
"\n",
"\n",
"def create_missing_list():\n",
"    \"\"\"Write to missing_list.txt the project JSON paths that have no row in raw_texts.\n",
"\n",
"    Project names are read one per line from the file `project_list`; their JSON\n",
"    paths come from get_project_files. A path counts as present when its file name\n",
"    without extension equals an id_text value already stored in raw_texts.\n",
"    The remaining paths are written newline-separated and their count is logged.\n",
"    \"\"\"\n",
"    # NOTE(review): connection settings, including the password, are hardcoded here;\n",
"    # they should come from the environment or a config file.\n",
"    conn = psycopg2.connect(dbname='dh', user='dh', password='qwerty',\n",
"                            host='dh.saret.tk', port='5432', sslmode='require')\n",
"    try:\n",
"        connection = conn.cursor()\n",
"        connection.execute('select id_text from raw_texts')\n",
"        known_ids = {row[0] for row in connection.fetchall()}\n",
"    finally:\n",
"        # Query before touching the output file and always release the connection.\n",
"        conn.close()\n",
"    with open(\"project_list\") as file:\n",
"        # strip() instead of [:-1]: the last line may not end with a newline.\n",
"        projects = [line.strip() for line in file if line.strip()]\n",
"    # assumes the JSON file name (minus .json) is the text id -- TODO confirm\n",
"    missing = [\n",
"        path\n",
"        for project in projects\n",
"        for path in get_project_files(project)\n",
"        if os.path.splitext(os.path.basename(path))[0] not in known_ids\n",
"    ]\n",
"    logging.info(len(missing))\n",
"    with open(\"missing_list.txt\", \"w\") as missing_list:\n",
"        missing_list.write(\"\\n\".join(missing))\n"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Grab grammatical transliteration"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def connect_to_database(\n",
"        database: str = None, user: str = None, password: str = None, host: str = None, port: int = None,\n",
"        sslmode: str = None, dbname: str = None) -> Tuple[\n",
"        psycopg2.extensions.connection, psycopg2.extensions.cursor]:\n",
"    \"\"\"Open a PostgreSQL connection and return it together with a new cursor.\n",
"\n",
"    Args:\n",
"        database: Database name when all other settings are supplied; otherwise a\n",
"            complete connection string handed to psycopg2.connect unchanged.\n",
"        user, password, host, port, sslmode: Individual connection settings. They are\n",
"            used only when all five are given; if any is None they are ignored.\n",
"        dbname: Keyword alias for `database`, so that calls written as\n",
"            connect_to_database(dbname=...) work; `database` wins when both are given.\n",
"\n",
"    Returns:\n",
"        (connection, cursor). On the connection-string path a failure is logged and\n",
"        (None, None) is returned instead.\n",
"\n",
"    Raises:\n",
"        TypeError: If neither `database` nor `dbname` is supplied.\n",
"        Exception: Whatever psycopg2.connect raises on the keyword-settings path.\n",
"    \"\"\"\n",
"    if database is None:\n",
"        database = dbname\n",
"    if database is None:\n",
"        raise TypeError(\"connect_to_database() missing required argument: 'database'\")\n",
"    settings = (user, password, host, port, sslmode)\n",
"    if all(value is not None for value in settings):\n",
"        # Keyword arguments let psycopg2 do the quoting; a hand-built DSN string\n",
"        # breaks as soon as a value contains a quote or a backslash.\n",
"        conn = psycopg2.connect(\n",
"            dbname=database, user=user, password=password, host=host, port=port, sslmode=sslmode)\n",
"        return conn, conn.cursor()\n",
"    try:\n",
"        conn = psycopg2.connect(database)\n",
"        return conn, conn.cursor()\n",
"    except Exception as e:\n",
"        logging.error(e)\n",
"        return None, None\n",
"\n",
"\n",
"def get_grammatical_transliteration_from_json() -> None:\n",
"    \"\"\"Update raw_texts.transliteration for every project listed in the projects table.\n",
"\n",
"    For each project, the records returned by scrapping.get_raw_akk_texts_of_project\n",
"    are applied with one batched UPDATE; the SQL placeholders require each record to\n",
"    provide `raw_text` and `id_text`. Any exception is logged and ends the run.\n",
"    \"\"\"\n",
"    conn = None\n",
"    try:\n",
"        # The first parameter of connect_to_database is `database`; passing dbname=\n",
"        # raised TypeError, which the handler below swallowed, so nothing ever ran.\n",
"        # NOTE(review): credentials are hardcoded; move them to env/config.\n",
"        conn, connection = connect_to_database(\n",
"            database='dh', user='dh', password='qwerty', host='dh.saret.tk', port='5432', sslmode='require')\n",
"        connection.execute(\"select project from projects\")\n",
"        for project in connection.fetchall():\n",
"            data_project = scrapping.get_raw_akk_texts_of_project(project[0])\n",
"            execute_batch(\n",
"                connection,\n",
"                'update raw_texts set transliteration=%(raw_text)s where id_text like %(id_text)s',\n",
"                data_project)\n",
"            # Commit per project so finished projects survive a later failure.\n",
"            conn.commit()\n",
"    except Exception as e:\n",
"        logging.error(f'{e}: {e.args}')\n",
"    finally:\n",
"        # conn stays None when connecting failed; otherwise always release it.\n",
"        if conn is not None:\n",
"            conn.close()\n",
"\n",
"def get_grammatical_transliteration_from_json_of_all_data() -> None:\n",
"    \"\"\"Insert (id_text, raw_text) rows into table `new` for every project in `projects`.\n",
"\n",
"    Each project is handled on its own: the records returned by\n",
"    scrapping.get_raw_akk_texts_of_project are inserted in one batch (conflicting rows\n",
"    are skipped by the SQL) and committed. A failing project is logged and rolled back\n",
"    and the loop continues. Errors while connecting or listing projects propagate.\n",
"    \"\"\"\n",
"    # The first parameter of connect_to_database is `database`; dbname= raised TypeError.\n",
"    # NOTE(review): credentials are hardcoded; move them to env/config.\n",
"    conn, connection = connect_to_database(\n",
"        database='dh', user='dh', password='qwerty', host='dh.saret.tk', port='5432', sslmode='require')\n",
"    try:\n",
"        connection.execute(\"select project from projects\")\n",
"        for project in connection.fetchall():\n",
"            try:\n",
"                data_project = scrapping.get_raw_akk_texts_of_project(project[0])\n",
"                execute_batch(\n",
"                    connection, 'insert into new values (%(id_text)s, %(raw_text)s) on conflict do nothing',\n",
"                    data_project)\n",
"                conn.commit()\n",
"            except Exception as e:\n",
"                logging.error(f'{e}: {e.args}')\n",
"                # Without a rollback a failed statement leaves the transaction aborted\n",
"                # and every following project would fail as well.\n",
"                conn.rollback()\n",
"    finally:\n",
"        if conn is not None:\n",
"            conn.close()\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"In file jsons_unzipped/adsd\\adart1\\corpusjson\\X103280.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/adsd\\adart2\\corpusjson\\X201680.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ario\\corpusjson\\Q007131.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ario\\corpusjson\\Q007189.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/atae\\guzana\\corpusjson\\X214098.json failed because of 'cdl'\n",
"In file jsons_unzipped/atae\\guzana\\corpusjson\\X214099.json failed because of 'cdl'\n",
"In file jsons_unzipped/atae\\guzana\\corpusjson\\X214100.json failed because of 'cdl'\n",
"In file jsons_unzipped/atae\\mallanate\\corpusjson\\X217065.json failed because of 'cdl'\n",
"In file jsons_unzipped/atae\\marqasu\\corpusjson\\P522581.json failed because of 'cdl'\n",
"In file jsons_unzipped/atae\\marqasu\\corpusjson\\P522591.json failed because of 'cdl'\n",
"In file jsons_unzipped/atae\\tilbarsip\\corpusjson\\P522609.json failed because of 'cdl'\n",
"In file jsons_unzipped/atae\\tilbarsip\\corpusjson\\P522611.json failed because of 'cdl'\n",
"In file jsons_unzipped/btto\\corpusjson\\Q004800.json failed because of 'cdl'\n",
"In file jsons_unzipped/btto/corpusjson\\Q004800.json failed because of 'cdl'\n",
"In file jsons_unzipped/cams\\gkab\\corpusjson\\P363695.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ccpo\\corpusjson\\P293337.json failed because of 'cdl'\n",
"In file jsons_unzipped/ctij\\corpusjson\\P261609.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ctij\\corpusjson\\X000533.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ctij/corpusjson\\P261609.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ctij/corpusjson\\X000533.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P000725.json failed because of 'cdl'\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P000726.json failed because of 'cdl'\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P000727.json failed because of 'cdl'\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P000728.json failed because of 'cdl'\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P010090.json failed because of 'cdl'\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P010091.json failed because of 'cdl'\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P010095.json failed because of 'cdl'\n",
"In file jsons_unzipped/dcclt\\corpusjson\\P010098.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P216852.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P254972.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P255048.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P255049.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P255051.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P368205.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P368236.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt\\corpusjson\\P368265.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P216852.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P254972.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P255048.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P255049.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P255051.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P368205.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P368236.json failed because of 'cdl'\n",
"In file jsons_unzipped/dccmt/corpusjson\\P368265.json failed because of 'cdl'\n",
"In file jsons_unzipped/ecut\\corpusjson\\Q000000.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ecut\\corpusjson\\Q006881.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ecut\\corpusjson\\Q006947.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\P395659.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003065.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003094.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003095.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003096.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003097.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003100.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003101.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003102.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003103.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003104.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003105.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003107.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003108.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003109.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003110.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003111.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003112.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003113.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003115.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003116.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003117.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003119.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003120.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003121.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003122.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003126.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003127.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003128.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003129.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003130.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003131.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003132.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003134.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003135.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003136.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003137.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003138.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003139.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003140.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003141.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003142.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003143.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003144.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003145.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003146.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003148.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003149.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003150.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003151.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003153.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003154.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003155.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003156.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003157.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003158.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003159.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003160.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003161.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003162.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003163.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003164.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003165.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003166.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003167.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003168.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003169.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003170.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003171.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003172.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003173.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003174.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003176.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003177.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003179.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003180.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003181.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003182.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003183.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003184.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003185.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003186.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003187.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003188.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003189.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003190.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003191.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003192.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003193.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003194.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003195.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003196.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003197.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003198.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003199.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003201.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003202.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003203.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003204.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003205.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003206.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003207.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003208.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003209.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003210.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003211.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003212.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003213.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003214.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003215.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003216.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003217.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003218.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003219.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003587.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003588.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003589.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003590.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003591.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003592.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003593.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003594.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003595.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003596.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003597.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003598.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003599.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003600.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003601.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003602.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003603.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003604.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003605.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q003607.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009362.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009363.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009364.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009366.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009368.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009374.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009375.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009376.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009377.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009378.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009380.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009381.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009382.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009383.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009386.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009390.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009391.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009392.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009393.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009394.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009395.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009396.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009397.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009398.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009401.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009402.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009403.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009404.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009405.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009406.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009408.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009409.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009410.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009412.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009414.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009415.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009416.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009420.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009421.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009423.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009424.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009425.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009426.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009429.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009430.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009431.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009432.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009433.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009434.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009436.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009437.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009438.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009439.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009440.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009444.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009447.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009448.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009453.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009457.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009463.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009470.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009471.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009504.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009505.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009506.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009507.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009508.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009509.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009510.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009511.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009512.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009513.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009514.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009515.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009516.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009517.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009518.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009519.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009520.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009521.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009522.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009523.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009524.json failed because of 'cdl'\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009525.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009526.json failed because of list index out of range\n",
"In file jsons_unzipped/eisl\\corpusjson\\Q009543.json failed because of 'cdl'\n",
"In file jsons_unzipped/hbtin\\corpusjson\\P342482.json failed because of 'cdl'\n",
"In file jsons_unzipped/hbtin/corpusjson\\P342482.json failed because of 'cdl'\n",
"In file jsons_unzipped/lacost\\corpusjson\\P226580.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost\\corpusjson\\P281779.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost\\corpusjson\\P432130.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost\\corpusjson\\P464355.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost\\corpusjson\\P464358.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost/corpusjson\\P226580.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost/corpusjson\\P281779.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost/corpusjson\\P432130.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost/corpusjson\\P464355.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/lacost/corpusjson\\P464358.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005390.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005403.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005406.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005437.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005480.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005491.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005496.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005529.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005530.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005539.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005546.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005547.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005548.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005549.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005553.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005590.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q005596.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q009187.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\babylon7\\corpusjson\\Q009237.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\sources\\corpusjson\\P269890.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\sources\\corpusjson\\P269948.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\sources\\corpusjson\\P269981.json failed because of 'cdl'\n",
"In file jsons_unzipped/ribo\\sources\\corpusjson\\P269982.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003442.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003446.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003447.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003614.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003615.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003616.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003631.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap1\\corpusjson\\Q003632.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap2\\corpusjson\\Q006502.json failed because of list index out of range\n",
"In file jsons_unzipped/rinap\\rinap2\\corpusjson\\Q006539.json failed because of list index out of range\n",
"In file jsons_unzipped/rinap\\rinap2\\corpusjson\\Q006543.json failed because of list index out of range\n",
"In file jsons_unzipped/rinap\\rinap2\\corpusjson\\Q006632.json failed because of list index out of range\n",
"In file jsons_unzipped/rinap\\rinap2\\corpusjson\\Q006635.json failed because of list index out of range\n",
"In file jsons_unzipped/rinap\\rinap2\\corpusjson\\Q006641.json failed because of list index out of range\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003480.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003488.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003492.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003507.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003522.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003543.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003544.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003545.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003546.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003551.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003555.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003575.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003988.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003994.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q003997.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004011.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004012.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004013.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004022.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004025.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004026.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004057.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004069.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap3\\corpusjson\\Q004076.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap4\\corpusjson\\Q003296.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap4\\corpusjson\\Q003301.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap4\\corpusjson\\Q003318.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap4\\corpusjson\\Q003331.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap4\\corpusjson\\Q003402.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap5\\corpusjson\\Q003729.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\rinap5\\corpusjson\\Q003747.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P225251.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P247873.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P390298.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P400488.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P427837.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P427844.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P427848.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P427889.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P427890.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P427980.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P428607.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450194.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450195.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450196.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450217.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450218.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450234.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450239.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450240.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450241.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450256.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450257.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450258.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450285.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450286.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450305.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450330.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450335.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450360.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450374.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450379.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450400.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450438.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450439.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450440.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450449.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450450.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450451.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450452.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450453.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450454.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450455.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450456.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450457.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450465.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450466.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450469.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450474.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450479.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450483.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450493.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450499.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450517.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450519.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P450521.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466653.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466657.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466661.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466682.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466683.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466684.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466685.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466687.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466691.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466703.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466704.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466706.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466711.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466713.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466719.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466720.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466729.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466735.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466748.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466755.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466756.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466759.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466778.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466790.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466799.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466803.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466848.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466851.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466852.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466863.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466867.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466869.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466870.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466871.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466872.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466873.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466874.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466875.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466876.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466877.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466878.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466879.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466880.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466882.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466883.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466884.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466886.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466887.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466899.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466900.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466901.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466902.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466903.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466904.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466905.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466906.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466907.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466908.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466909.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466910.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466911.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466912.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466913.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466914.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466915.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466916.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466917.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466918.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466919.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466920.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466921.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466924.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466925.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466926.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466927.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466929.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466935.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466938.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466942.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466944.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466947.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466955.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466956.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466966.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466967.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466968.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466969.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466977.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466987.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466988.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466992.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466993.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466994.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466997.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P466998.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467000.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467004.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467005.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467006.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467008.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467012.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467017.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467024.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467033.json failed because of 'cdl'\n",
"In file jsons_unzipped/rinap\\sources\\corpusjson\\P467034.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa01\\corpusjson\\X010028.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa04\\corpusjson\\P238071.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa04\\corpusjson\\P238827.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa04\\corpusjson\\P336102.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa04\\corpusjson\\P336123.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa07\\corpusjson\\P335792.json failed because of 'NoneType' object is not subscriptable\n",
"In file jsons_unzipped/saao\\saa08\\corpusjson\\X080510.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa12\\corpusjson\\P336269.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa15\\corpusjson\\X150335.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa18\\corpusjson\\X180026.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saa18\\corpusjson\\X238658.json failed because of 'cdl'\n",
"In file jsons_unzipped/saao\\saas2\\corpusjson\\P424508.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu\\corpusjson\\Q006231.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu\\corpusjson\\Q006232.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu\\corpusjson\\Q006234.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu\\corpusjson\\Q006235.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu\\corpusjson\\Q006236.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu\\corpusjson\\Q006237.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu/corpusjson\\Q006231.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu/corpusjson\\Q006232.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu/corpusjson\\Q006234.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu/corpusjson\\Q006235.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu/corpusjson\\Q006236.json failed because of 'cdl'\n",
"In file jsons_unzipped/suhu/corpusjson\\Q006237.json failed because of 'cdl'\n"
]
}
],
"source": [
"# get_grammatical_transliteration_from_json()\n",
"get_grammatical_transliteration_from_json_of_all_data()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Legacy and experimental"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def run_this_other_shit() -> None:\n",
"    \"\"\"Call download_text on every path listed in missing_list.txt (one path per line).\"\"\"\n",
"    with open('missing_list.txt') as missing_list:\n",
"        for line in missing_list:\n",
"            # Strip only the line terminator, so a final line without a trailing\n",
"            # newline keeps its last character.\n",
"            path = line.rstrip('\\n')\n",
"            if path:  # skip blank lines instead of requesting an empty path\n",
"                download_text(path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": []
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [],
"source": [
"conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
"conn = psycopg2.connect(dbname='dh', user='dh', password='qwerty', host='dh.saret.tk', port='5432', sslmode='require')\n",
"connection = conn.cursor()\n",
"with open('project_list') as f:\n",
" for project in f.read().split('\\n'):\n",
@@ -122,7 +811,7 @@
"metadata": {},
"outputs": [],
"source": [
"conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
"conn = psycopg2.connect(dbname='dh', user='dh', password='qwerty', host='dh.saret.tk', port='5432', sslmode='require')\n",
"connection = conn.cursor()\n",
"logging.basicConfig(\n",
" level=logging.INFO, filename=f'./now.log',\n",
@@ -150,7 +839,7 @@
"outputs": [],
"source": [
"# with open('cams') as file:\n",
"conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
"conn = psycopg2.connect(dbname='dh', user='dh', password='qwerty', host='dh.saret.tk', port='5432', sslmode='require')\n",
"connection = conn.cursor()\n",
"scrap = scrapping.get_raw_english_texts_of_project('saao')\n",
"execute_batch(\n",
@@ -86140,26 +86829,6 @@
"# download_text(r\"C:\\Users\\Saret\\Neutral Folder\\dh\\saao\\saa03\\corpusjson\\P336604.json\")\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def create_missing_list():\n",
"    \"\"\"Write to missing_list.txt the project files whose id is not yet in raw_texts.\n",
"\n",
"    Project names are read from the file 'project_list' (one per line) and expanded\n",
"    with get_project_files; every listed path that mentions an id_text already stored\n",
"    in the raw_texts table is removed before the remainder is written out.\n",
"    \"\"\"\n",
"    # NOTE(review): the database credentials are hard-coded in this connection string.\n",
"    conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
"    try:\n",
"        connection = conn.cursor()\n",
"        with open(\"project_list\") as file:\n",
"            # rstrip keeps the last character of a final line that has no trailing newline\n",
"            projects = [project.rstrip('\\n') for project in file]\n",
"        files = \"\\n\".join(['\\n'.join(get_project_files(project)) for project in projects if project])\n",
"        with open(\"missing_list.txt\", \"w\") as missing_list:\n",
"            connection.execute('select id_text from raw_texts')\n",
"            ids = [nod[0] for nod in connection.fetchall()]\n",
"            for _id in ids:\n",
"                # Escape the id so it is matched literally inside the pattern\n",
"                files = re.sub(f'jsons_unz.+{re.escape(str(_id))}.+\\n+', '', files)\n",
"            logging.info(files.count('n\\n'))\n",
"            missing_list.write(files)\n",
"    finally:\n",
"        conn.close()  # release the connection even when the query or the write fails\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
@@ -86169,25 +86838,13 @@
"create_missing_list()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"run_this_other_shit()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def run_this_other_shit() -> None:\n",
"    \"\"\"Call download_text on every path listed in missing_list.txt (one path per line).\"\"\"\n",
"    with open('missing_list.txt') as missing_list:\n",
"        for line in missing_list:\n",
"            # Strip only the line terminator, so a final line without a trailing\n",
"            # newline keeps its last character.\n",
"            path = line.rstrip('\\n')\n",
"            if path:  # skip blank lines instead of requesting an empty path\n",
"                download_text(path)"
"run_this_other_shit()"
]
},
{

View File

@@ -31,6 +31,7 @@ def _load_json_from_path(json_path: str) -> Dict:
if os.stat(json_path).st_size != 0: # If the file is not empty:
return json.load(json_file)
def _download_data_from_website(url: str) -> ResultSet:
try:
res = requests.get(url)
@@ -38,12 +39,14 @@ def _download_data_from_website(url: str) -> ResultSet:
return soup.find_all("span", {"class": "cell"})
except Exception as e:
logging.error(e)
return ResultSet()
return list()
def _clean_raw_text(results: ResultSet)->str:
def _clean_raw_text(results: ResultSet) -> str:
return " ".join(["".join([content if isinstance(content, str) else content.text
for content in result.contents]) for result in results]).replace('\n', ' ')
def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
raw_jsons = list()
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True) + glob.glob(
@@ -139,13 +142,13 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
:return: A list of jsons containing the raw texts of the given project and basic metadata.
"""
raw_jsons = list()
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)+glob.glob(
f'jsons_unzipped/{project_dirname}/corpusjson/*.json', recursive=True)
for filename in all_paths:
cur_json = _load_json_from_path(filename)
try:
project_name = cur_json['project']
sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
except Exception as e:
print(f"In file {filename} failed because of {e}")
@@ -154,8 +157,7 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
raw_text = get_raw_akk_text_from_json(sents_dicts)
raw_jsons.append({
"id_text": cur_json['textid'],
"project_name": project_name,
"raw_text": raw_text,
"raw_text": raw_text
})
# if not texts_jsons or not texts_jsons.get('members'):
@@ -209,7 +211,7 @@ def _get_raw_text(json_dict: dict) -> str:
raw_texts.extend(_get_raw_text(d['cdl']).split())
elif _is_word(d): # If node represents a word
if previous_ref != d.get('ref'): # If encountered new instance:
cur_text = d['frag'] if d.get('frag') else d['f']['form']
cur_text = d['f']['norm'] if d['f'].get('norm') else d['f']['form']
raw_texts.append(cur_text + _get_addition(d))
previous_ref = d.get('ref')