Update the ORACC link; read the project that previously failed

This commit is contained in:
2023-04-12 11:23:35 +03:00
parent 5b071fbac3
commit 15c9b56fd0
29 changed files with 7957 additions and 31 deletions

View File

@@ -32,7 +32,7 @@ def _load_json_from_path(json_path: str) -> Dict:
return json.load(json_file)
def get_raw_english_texts_of_project(project_dirname: str) -> List[Dict]:
def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
raw_jsons = list()
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
# path = Path(os.path.join(JSONS_DIR, project_dirname, 'catalogue.json'))
@@ -51,7 +51,7 @@ def get_raw_english_texts_of_project(project_dirname: str) -> List[Dict]:
# id_text = member.get('id_text', "") + member.get('id_composite', "")
# html_dir = "/".join(path.parts[1:-1])
url = f"http://oracc.iaas.upenn.edu/{project_name}/{cur_json['textid']}/html"
url = f"http://{oracc_site}/{project_name}/{cur_json['textid']}/html"
# print(url)
logging.info(url)
try: