finished scraping all the data

2023-04-12 22:05:16 +03:00
parent df548fa29d
commit 218a3d8135
10 changed files with 90845 additions and 22884 deletions

@@ -1,7 +1,7 @@
import json
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, ResultSet
import os
from pathlib import Path
import re
@@ -31,6 +31,18 @@ def _load_json_from_path(json_path: str) -> Dict:
if os.stat(json_path).st_size != 0: # If the file is not empty:
return json.load(json_file)
def _download_data_from_website(url: str) -> ResultSet:
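    """Fetch `url` and return its <span class="cell"> elements; an empty ResultSet on failure."""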
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        return soup.find_all("span", {"class": "cell"})
    except Exception as e:
        logging.error(e)
        return ResultSet(None)  # ResultSet requires a source argument; None yields an empty result set

def _clean_raw_text(results: ResultSet) -> str:
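    """Join the text of the scraped cells into one string, replacing newlines with spaces."""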
return " ".join(["".join([content if isinstance(content, str) else content.text
for content in result.contents]) for result in results]).replace('\n', ' ')
def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
raw_jsons = list()
@@ -44,8 +56,11 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'or
# for member in d.get('members').values():
for filename in all_paths:
cur_json = _load_json_from_path(filename)
project_name = cur_json['project']
try:
project_name = cur_json['project']
except TypeError:
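# cur_json is None when the JSON file was empty (_load_json_from_path falls through without returning),
# so subscripting it raises TypeError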
logging.error(f"Error in {filename}")
continue
# # Skip in case we are in saa project and the current sub project is not in neo-assyrian
# if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS: # TODO: validate
# continue
@@ -56,12 +71,7 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'or
# print(url)
logging.info(url)
try:
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")
results = soup.find_all("span", {"class": "cell"})
raw_text = " ".join(["".join([content if isinstance(content, str) else content.text
for content in result.contents]) for result in results])
raw_text = raw_text.replace('\n', ' ')
raw_text = _clean_raw_text(_download_data_from_website(url))
if raw_text:
raw_jsons.append({
"id_text": cur_json['textid'],