update the progress

This commit is contained in:
2023-04-17 02:51:50 +03:00
parent a9e93bd99f
commit 89ec3a3578
4 changed files with 4831 additions and 57 deletions

View File

@@ -31,6 +31,7 @@ def _load_json_from_path(json_path: str) -> Dict:
if os.stat(json_path).st_size != 0: # If the file is not empty:
return json.load(json_file)
def _download_data_from_website(url: str) -> ResultSet:
try:
res = requests.get(url)
@@ -38,12 +39,14 @@ def _download_data_from_website(url: str) -> ResultSet:
return soup.find_all("span", {"class": "cell"})
except Exception as e:
logging.error(e)
return ResultSet()
def _clean_raw_text(results: ResultSet)->str:
return list()
def _clean_raw_text(results: ResultSet) -> str:
    """
    Flatten the given elements into one single-line string.

    :param results: An iterable of elements, each exposing a ``contents`` sequence.
    :return: The text of every element, separated by single spaces, with each newline replaced by a space.
    """
    element_texts = list()
    for result in results:
        fragments = list()
        for content in result.contents:
            if isinstance(content, str):  # String children are taken verbatim
                fragments.append(content)
            else:  # Any other child contributes its ``text`` attribute
                fragments.append(content.text)
        element_texts.append("".join(fragments))
    single_spaced = " ".join(element_texts)
    return single_spaced.replace('\n', ' ')
def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
raw_jsons = list()
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True) + glob.glob(
@@ -139,13 +142,13 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
:return: A list of jsons containing the raw texts of the given project and basic metadata.
"""
raw_jsons = list()
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)+glob.glob(
f'jsons_unzipped/{project_dirname}/corpusjson/*.json', recursive=True)
for filename in all_paths:
cur_json = _load_json_from_path(filename)
try:
project_name = cur_json['project']
sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
except Exception as e:
print(f"In file {filename} failed because of {e}")
@@ -154,8 +157,7 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
raw_text = get_raw_akk_text_from_json(sents_dicts)
raw_jsons.append({
"id_text": cur_json['textid'],
"project_name": project_name,
"raw_text": raw_text,
"raw_text": raw_text
})
# if not texts_jsons or not texts_jsons.get('members'):
@@ -209,7 +211,7 @@ def _get_raw_text(json_dict: dict) -> str:
raw_texts.extend(_get_raw_text(d['cdl']).split())
elif _is_word(d): # If node represents a word
if previous_ref != d.get('ref'): # If encountered new instance:
cur_text = d['frag'] if d.get('frag') else d['f']['form']
cur_text = d['f']['norm'] if d['f'].get('norm') else d['f']['form']
raw_texts.append(cur_text + _get_addition(d))
previous_ref = d.get('ref')