Files
DH/project_notebook.ipynb
2023-04-12 12:29:22 +03:00

7.7 KiB

Imported modules

In [1]:
import json
import scrapping
import psycopg2
from psycopg2.extras import execute_batch
import re
import logging
import datetime
import os
# Route INFO-and-above records of this session to a fresh log file whose name
# is the kernel start time (an existing file of the same name is overwritten).
logging.basicConfig(
    filename=datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + ".log",
    filemode='w',
    level=logging.INFO,
    format='%(name)s - %(levelname)s - %(message)s',
    # Only takes effect if %(asctime)s is added to `format`.
    datefmt='%d-%b-%y %H:%M:%S',
)
# from psycopg2.extensions import register_adapter
In [29]:
# Scrape every project named in `project_list` (one name per line) and insert
# its texts into the `raw_texts` table. Rows that cannot be inserted are dumped
# as JSON to a file named after the project so they can be loaded later.
#
# NOTE(review): the fallback DSN still embeds credentials. Export DH_DSN and
# remove the literal once every user of this notebook has switched over.
conn = psycopg2.connect(
    os.environ.get(
        "DH_DSN",
        "dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'"))
connection = conn.cursor()

with open('project_list') as project_list_file:
    project_names = project_list_file.read().split('\n')

for project in project_names:
    # A trailing newline in the list produces an empty name; nothing to scrape.
    if not project:
        continue

    try:
        scrap = scrapping.get_raw_english_texts_of_project(project)
    except Exception as exc:
        # Log the actual exception (not the Exception class) and skip the
        # insert: `scrap` is either undefined or still holds the rows of the
        # previously scraped project.
        logging.error(f"Error in {project}:{exc!r}")
        continue

    try:
        execute_batch(
            connection,
            "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
            scrap)
        conn.commit()
    except Exception as exc:
        # Record the failure first so it is not lost if the dump below fails.
        logging.error(f"Error in {project}:{exc!r}")
        # Keep the scraped rows on disk so they need not be scraped again.
        with open(project, 'w') as dump_file:
            dump_file.write(json.dumps(scrap))
        # A failed statement leaves the transaction aborted; roll back so the
        # next project's insert is not rejected.
        conn.rollback()
In [3]:
# Load scrape results that were dumped to disk (one JSON file per project,
# named after the project) and insert them into the `raw_texts` table.
#
# NOTE(review): the fallback DSN still embeds credentials. Export DH_DSN and
# remove the literal once every user of this notebook has switched over.
conn = psycopg2.connect(
    os.environ.get(
        "DH_DSN",
        "dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'"))
connection = conn.cursor()

# force=True (Python 3.8+): the root logger is already configured by the
# imports cell, and without it this second basicConfig call is silently
# ignored, so nothing would ever be written to ./now.log.
logging.basicConfig(
    level=logging.INFO, filename='./now.log',
    format='%(name)s - %(levelname)s - %(message)s', force=True)
logging.info('Start')

# NOTE(review): 'unicode-escape' is an unusual codec for a list of project
# names and differs from the scraping cell, which reads the same file with the
# default encoding - confirm it is intended.
with open('project_list', encoding='unicode-escape') as project_list_file:
    project_names = project_list_file.read().split('\n')

for project in project_names:
    # isfile rather than exists: a directory with the project's name would
    # pass an existence check and then make open() raise.
    if not os.path.isfile(project):
        continue

    logging.info(f'reading {project}')
    with open(project) as dump_file:
        scrap = json.load(dump_file)

    # `with conn` commits on success and rolls back if the insert raises; it
    # does not close the connection.
    with conn:
        execute_batch(
            connection,
            "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
            scrap)
In [4]:
# with open('cams') as file:
# Scrape the 'saao' project and insert its texts into the `raw_texts` table.
#
# Scraping runs before the connection is opened so that a scraping failure
# does not leave a freshly opened, unused database connection behind.
scrap = scrapping.get_raw_english_texts_of_project('saao')

# NOTE(review): the fallback DSN still embeds credentials. Export DH_DSN and
# remove the literal once every user of this notebook has switched over.
conn = psycopg2.connect(
    os.environ.get(
        "DH_DSN",
        "dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'"))
connection = conn.cursor()

# `with conn` commits on success and rolls back if the insert raises; it does
# not close the connection.
with conn:
    execute_batch(
        connection,
        "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
        scrap)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
Cell In[4], line 4
      2 conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
      3 connection = conn.cursor()
----> 4 scrap = scrapping.get_raw_english_texts_of_project('saao')
      5 execute_batch(
      6     connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
      7     scrap)
      8 conn.commit()

File c:\Users\Saret\Programming\C#\DH\scrapping.py:46, in get_raw_english_texts_of_project(project_dirname, oracc_site)
     44 for filename in all_paths:
     45     cur_json = _load_json_from_path(filename)
---> 46     project_name = cur_json['project']
     48     # # Skip in case we are in saa project and the current sub project is not in neo-assyrian
     49     # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
     50     #     continue
     51 
     52     # id_text = member.get('id_text', "") + member.get('id_composite', "")
     53     # html_dir = "/".join(path.parts[1:-1])
     54     url = f"http://{oracc_site}/{project_name}/{cur_json['textid']}/html"

TypeError: 'NoneType' object is not subscriptable
In [ ]:
 
In [ ]: