7.7 KiB
7.7 KiB
Imported modules¶
In [1]:
# Standard library
import datetime
import json
import logging
import os
import re

# Third-party
import psycopg2
from psycopg2.extras import execute_batch

# Project-local scraping helpers
import scrapping

# Send log records to a file named after the moment this cell is executed.
# `%(asctime)s` is part of the format so that `datefmt` actually takes effect;
# without it the date format is accepted but never rendered.
logging.basicConfig(
    level=logging.INFO,
    filename=f'{datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")}.log',
    filemode='w',
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%d-%b-%y %H:%M:%S')
# from psycopg2.extensions import register_adapter
In [29]:
# Scrape every project named in `project_list` and insert its texts into the
# `raw_texts` table. A project whose insert fails is dumped as JSON to a local
# file named after the project.
#
# NOTE(review): the database password is hardcoded in this cell; consider
# reading the connection string from the environment before sharing the notebook.
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()
try:
    with open('project_list') as project_file:
        # Skip blank entries (e.g. produced by a trailing newline in the file).
        projects = [name for name in project_file.read().split('\n') if name.strip()]

    for project in projects:
        # connection = connection.execute("insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)", scrap)
        try:
            scrap = scrapping.get_raw_english_texts_of_project(project)
        except Exception as exc:
            # Log the actual error (with traceback) and move on: there is
            # nothing to insert for this project, and reusing `scrap` from a
            # previous iteration would insert the wrong project's rows.
            logging.exception(f"Error in {project}:{exc}")
            continue

        try:
            execute_batch(
                connection,
                "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
                scrap)
            conn.commit()
        except Exception as exc:
            logging.exception(f"Error in {project}:{exc}")
            # A failed statement leaves the transaction aborted; roll back so
            # the following projects can still be inserted.
            conn.rollback()
            # Keep the scraped data on disk so the insert can be retried later.
            with open(project, 'w') as dump_file:
                dump_file.write(json.dumps(scrap))
finally:
    # Release the cursor and the connection even if the cell stops early.
    connection.close()
    conn.close()
In [3]:
# Re-insert projects from local JSON files: for every name in `project_list`
# that is also a file on disk, load its JSON content and insert it into the
# `raw_texts` table.
#
# NOTE(review): the database password is hardcoded in this cell; consider
# reading the connection string from the environment before sharing the notebook.
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()

# `basicConfig` does nothing when the root logger already has handlers, so
# `force=True` is required for this cell to actually switch to ./now.log
# when logging was configured earlier in the same kernel.
logging.basicConfig(
    level=logging.INFO,
    filename='./now.log',
    format='%(name)s - %(levelname)s - %(message)s',
    force=True)
logging.info('Start')

try:
    # NOTE(review): 'unicode-escape' decoding interprets backslash escapes in
    # the file content - confirm this is intended for the project names.
    with open('project_list', encoding='unicode-escape') as project_file:
        projects = project_file.read().split('\n')

    for project in projects:
        # Only names that exist as regular files can be loaded; blank names
        # and directories are skipped.
        if not os.path.isfile(project):
            continue

        logging.info(f'reading {project}')
        with open(project) as dump_file:
            scrap = json.load(dump_file)

        execute_batch(
            connection,
            "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
            scrap)
        conn.commit()
finally:
    # Release the cursor and the connection even if an insert raises.
    connection.close()
    conn.close()
In [4]:
# Scrape the single project 'saao' and insert its texts into `raw_texts`.
#
# NOTE(review): the recorded output of this cell shows a TypeError raised
# inside scrapping.get_raw_english_texts_of_project ('NoneType' object is not
# subscriptable on cur_json['project']); that has to be fixed in scrapping.py.
# NOTE(review): the database password is hardcoded in this cell; consider
# reading the connection string from the environment before sharing the notebook.
conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
connection = conn.cursor()
try:
    scrap = scrapping.get_raw_english_texts_of_project('saao')
    execute_batch(
        connection,
        "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING",
        scrap)
    conn.commit()
finally:
    # Close the cursor and connection even when scraping or inserting raises,
    # so a failed run does not leave an open database session behind.
    connection.close()
    conn.close()
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) Cell In[4], line 4 2 conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'") 3 connection = conn.cursor() ----> 4 scrap = scrapping.get_raw_english_texts_of_project('saao') 5 execute_batch( 6 connection, "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s) ON CONFLICT DO NOTHING", 7 scrap) 8 conn.commit() File c:\Users\Saret\Programming\C#\DH\scrapping.py:46, in get_raw_english_texts_of_project(project_dirname, oracc_site) 44 for filename in all_paths: 45 cur_json = _load_json_from_path(filename) ---> 46 project_name = cur_json['project'] 48 # # Skip in case we are in saa project and the current sub project is not in neo-assyrian 49 # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS: # TODO: validate 50 # continue 51 52 # id_text = member.get('id_text', "") + member.get('id_composite', "") 53 # html_dir = "/".join(path.parts[1:-1]) 54 url = f"http://{oracc_site}/{project_name}/{cur_json['textid']}/html" TypeError: 'NoneType' object is not subscriptable
In [ ]:
In [ ]: