update the progress

2023-04-17 02:51:50 +03:00
parent a9e93bd99f
commit 89ec3a3578
4 changed files with 4831 additions and 57 deletions
--- a/scrapping.py
+++ b/scrapping.py
@@ -31,6 +31,7 @@ def _load_json_from_path(json_path: str) -> Dict:
        if os.stat(json_path).st_size != 0:  # If the file is not empty:
            return json.load(json_file)

+
 def _download_data_from_website(url: str) -> ResultSet:
    try:
        res = requests.get(url)
@@ -38,12 +39,14 @@ def _download_data_from_website(url: str) -> ResultSet:
        return soup.find_all("span", {"class": "cell"})
    except Exception as e:
        logging.error(e)
-        return ResultSet()
-    
-def _clean_raw_text(results: ResultSet)->str:
+        return list()
+
+
+def _clean_raw_text(results: ResultSet) -> str:
    return " ".join(["".join([content if isinstance(content, str) else content.text
                              for content in result.contents]) for result in results]).replace('\n', ' ')

+
 def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
    raw_jsons = list()
    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True) + glob.glob(
@@ -139,13 +142,13 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
    :return: A list of jsons containing the raw texts of the given project and basic metadata.
    """
    raw_jsons = list()
-    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
+    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)+glob.glob(
+        f'jsons_unzipped/{project_dirname}/corpusjson/*.json', recursive=True)

    for filename in all_paths:
        cur_json = _load_json_from_path(filename)

        try:
-            project_name = cur_json['project']
            sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
        except Exception as e:
            print(f"In file {filename} failed because of {e}")
@@ -154,8 +157,7 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
        raw_text = get_raw_akk_text_from_json(sents_dicts)
        raw_jsons.append({
            "id_text": cur_json['textid'],
-            "project_name": project_name,
-            "raw_text": raw_text,
+            "raw_text": raw_text
        })

    # if not texts_jsons or not texts_jsons.get('members'):
@@ -209,7 +211,7 @@ def _get_raw_text(json_dict: dict) -> str:
            raw_texts.extend(_get_raw_text(d['cdl']).split())
        elif _is_word(d):  # If node represents a word
            if previous_ref != d.get('ref'):  # If encountered new instance:
-                cur_text = d['frag'] if d.get('frag') else d['f']['form']
+                cur_text = d['f']['norm'] if d['f'].get('norm') else d['f']['form']
                raw_texts.append(cur_text + _get_addition(d))
                previous_ref = d.get('ref')