finished scraping all the data

2023-04-12 22:05:16 +03:00
parent df548fa29d
commit 218a3d8135
10 changed files with 90845 additions and 22884 deletions

@@ -1,7 +1,7 @@
import json
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, ResultSet
import os
from pathlib import Path
import re
@@ -31,6 +31,18 @@ def _load_json_from_path(json_path: str) -> Dict:
if os.stat(json_path).st_size != 0: # If the file is not empty:
return json.load(json_file)
def _download_data_from_website(url: str) -> ResultSet:
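    """Fetch `url` and return its <span class="cell"> elements; an empty ResultSet on failure."""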
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        return soup.find_all("span", {"class": "cell"})
    except Exception as e:
        logging.error(e)
        return ResultSet(None)  # ResultSet requires a source argument; None yields an empty result set

def _clean_raw_text(results: ResultSet) -> str:
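    """Join the text of the scraped cells into one string, replacing newlines with spaces."""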
return " ".join(["".join([content if isinstance(content, str) else content.text
for content in result.contents]) for result in results]).replace('\n', ' ')
def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
raw_jsons = list()
@@ -44,8 +56,11 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'or
# for member in d.get('members').values():
for filename in all_paths:
cur_json = _load_json_from_path(filename)
project_name = cur_json['project']
try:
project_name = cur_json['project']
except TypeError:
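# cur_json is None when the JSON file was empty (_load_json_from_path falls through without returning),
# so subscripting it raises TypeError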
logging.error(f"Error in {filename}")
continue
# # Skip in case we are in saa project and the current sub project is not in neo-assyrian
# if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS: # TODO: validate
# continue
@@ -56,12 +71,7 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'or
# print(url)
logging.info(url)
try:
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")
results = soup.find_all("span", {"class": "cell"})
raw_text = " ".join(["".join([content if isinstance(content, str) else content.text
for content in result.contents]) for result in results])
raw_text = raw_text.replace('\n', ' ')
raw_text = _clean_raw_text(_download_data_from_website(url))
if raw_text:
raw_jsons.append({
"id_text": cur_json['textid'],