Files
DH/project_notebook.ipynb
2023-04-12 01:37:57 +03:00

6.0 KiB

Imported modules

In [30]:
import json
import scrapping
import psycopg2
from psycopg2.extras import execute_batch
import re
import logging
import datetime
logging.basicConfig(
    level=logging.INFO, filename=f'{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log', filemode='w',
    format='%(name)s - %(levelname)s - %(message)s', datefmt='%d-%b-%y %H:%M:%S')
# from psycopg2.extensions import register_adapter
In [17]:
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()
with open('project_list') as f:
    for project in f.read().split('\n'):
        scrap = scrapping.get_raw_english_texts_of_project(project)
        # connection = connection.execute("insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)
        try:
            execute_batch(connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)
            conn.commit()
        except Exception:
            with open(project, 'w') as f:
                f.write(json.dumps(scrap))
            logging.error(f"Error in {project}:{Exception}")
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[17], line 5
      3 with open('project_list') as f:
      4     for project in f.read().split('\n'):
----> 5         scrap = scrapping.get_raw_english_texts_of_project(project)
      6         # connection = connection.execute("insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)
      7         try:

File c:\Users\Saret\Programming\C#\DH\scrapping.py:43, in get_raw_english_texts_of_project(project_dirname)
     41 for filename in all_paths:
     42     cur_json = _load_json_from_path(filename)
---> 43     project_name = cur_json['project']
     45     # # Skip in case we are in saa project and the current sub project is not in neo-assyrian
     46     # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
     47     #     continue
     48 
     49     # id_text = member.get('id_text', "") + member.get('id_composite', "")
     50     # html_dir = "/".join(path.parts[1:-1])
     51     url = f"http://oracc.iaas.upenn.edu/{project_name}/{cur_json['textid']}/html"

TypeError: 'NoneType' object is not subscriptable
In [33]:
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()
logging.basicConfig(
    level=logging.INFO, filename=f'./now.log',
    format='%(name)s - %(levelname)s - %(message)s')
logging.info('Start')
with open('project_list') as f:
    project = f.read().split('\n')[2]
    scrap = scrapping.get_raw_english_texts_of_project(project)
    execute_batch(connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)
    conn.commit()
In [44]:

In [ ]:
 
In [ ]: