Files
DH/project_notebook.ipynb
2023-04-12 08:24:03 +03:00

6.9 KiB

Imported modules

In [1]:
import json
import scrapping
import psycopg2
from psycopg2.extras import execute_batch
import re
import logging
import datetime
# Send INFO-and-above records to a fresh log file named after the start time of this run.
# The format includes %(asctime)s so that the datefmt below actually takes effect;
# without it datefmt is silently unused and records carry no timestamp.
logging.basicConfig(
    level=logging.INFO, filename=f'{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log', filemode='w',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
# from psycopg2.extensions import register_adapter
In [3]:
# Scrape every project listed in `project_list` (one name per line) and insert the
# scraped rows into the `raw_texts` table, committing once per project.
# NOTE(review): the DSN embeds a plaintext password; consider reading it from the
# environment or a secrets store instead of committing it with the notebook.
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()  # NB: despite its name this is a cursor, not the connection
try:
    with open('project_list') as f:
        # Drop blank lines (e.g. the one produced by a trailing newline) so that an
        # empty project name is never passed to the scraper.
        projects = [name.strip() for name in f.read().splitlines() if name.strip()]

    for project in projects:
        try:
            scrap = scrapping.get_raw_english_texts_of_project(project)
        except Exception as exc:
            # Nothing was scraped, so there is nothing to insert: skip this project
            # instead of falling through with an undefined (or the previous
            # project's) `scrap`.
            logging.exception(f"Error in {project}:{exc}")
            continue

        try:
            execute_batch(connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)
            conn.commit()
        except Exception as exc:
            logging.exception(f"Error in {project}:{exc}")
            # Keep the scraped rows on disk, in a file named after the project, so
            # they can be inserted later without scraping again.
            with open(project, 'w') as dump_file:
                json.dump(scrap, dump_file)
            # Leave the aborted transaction; otherwise every following insert on
            # this connection would fail as well.
            conn.rollback()
finally:
    # Release the cursor and the database connection even if the loop raised.
    connection.close()
    conn.close()
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[3], line 6
      3 with open('project_list') as f:
      4     for project in f.read().split('\n'):
      5         # connection = connection.execute("insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)
----> 6         scrap = scrapping.get_raw_english_texts_of_project(project)
      7         try:
      8             execute_batch(connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)

File c:\Users\Saret\Programming\C#\DH\scrapping.py:46, in get_raw_english_texts_of_project(project_dirname)
     44 for filename in all_paths:
     45     cur_json = _load_json_from_path(filename)
---> 46     project_name = cur_json['project']
     48     # # Skip in case we are in saa project and the current sub project is not in neo-assyrian
     49     # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
     50     #     continue
     51 
     52     # id_text = member.get('id_text', "") + member.get('id_composite', "")
     53     # html_dir = "/".join(path.parts[1:-1])
     54     url = f"http://oracc.iaas.upenn.edu/{project_name}/{cur_json['textid']}/html"

TypeError: 'NoneType' object is not subscriptable
In [33]:
# Scrape a single project (the third line of `project_list`) and insert its rows into
# `raw_texts`, skipping rows that conflict with existing ones.
# NOTE(review): the DSN embeds a plaintext password; consider reading it from the
# environment or a secrets store instead of committing it with the notebook.
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()  # NB: despite its name this is a cursor, not the connection
try:
    # NOTE(review): basicConfig does nothing when the root logger already has handlers
    # (for instance if an earlier cell configured logging in this kernel), in which
    # case records keep going to the earlier log file rather than ./now.log.
    logging.basicConfig(
        level=logging.INFO, filename='./now.log',
        format='%(name)s - %(levelname)s - %(message)s')
    logging.info('Start')

    # Only the project name is needed from the file, so close it before the slow
    # scraping and database work instead of holding it open throughout.
    with open('project_list') as f:
        project = f.read().split('\n')[2]

    scrap = scrapping.get_raw_english_texts_of_project(project)
    execute_batch(
        connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
        scrap)
    conn.commit()
finally:
    # Release the cursor and the database connection even if scraping or inserting raised.
    connection.close()
    conn.close()
In [25]:
# Insert the rows stored as JSON in the file `cams` into `raw_texts`, skipping rows
# that conflict with existing ones.
# NOTE(review): presumably `cams` is a dump of scraped rows for the "cams" project
# that could not be inserted earlier - confirm where the file comes from.
with open('cams') as file:
    # Read everything up front so the file is closed before any database work starts.
    rows = json.load(file)

# NOTE(review): the DSN embeds a plaintext password; consider reading it from the
# environment or a secrets store instead of committing it with the notebook.
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()  # NB: despite its name this is a cursor, not the connection
try:
    execute_batch(
        connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
        rows)
    conn.commit()
finally:
    # Release the cursor and the database connection even if the insert raised.
    connection.close()
    conn.close()
In [ ]:
 
In [ ]: