update the progress
This commit is contained in:
18
scrapping.py
18
scrapping.py
@@ -31,6 +31,7 @@ def _load_json_from_path(json_path: str) -> Dict:
|
||||
if os.stat(json_path).st_size != 0: # If the file is not empty:
|
||||
return json.load(json_file)
|
||||
|
||||
|
||||
def _download_data_from_website(url: str) -> ResultSet:
|
||||
try:
|
||||
res = requests.get(url)
|
||||
@@ -38,12 +39,14 @@ def _download_data_from_website(url: str) -> ResultSet:
|
||||
return soup.find_all("span", {"class": "cell"})
|
||||
except Exception as e:
|
||||
logging.error(e)
|
||||
return ResultSet()
|
||||
|
||||
def _clean_raw_text(results: ResultSet)->str:
|
||||
return list()
|
||||
|
||||
|
||||
def _clean_raw_text(results: ResultSet) -> str:
|
||||
return " ".join(["".join([content if isinstance(content, str) else content.text
|
||||
for content in result.contents]) for result in results]).replace('\n', ' ')
|
||||
|
||||
|
||||
def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
|
||||
raw_jsons = list()
|
||||
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True) + glob.glob(
|
||||
@@ -139,13 +142,13 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
|
||||
:return: A list of jsons containing the raw texts of the given project and basic metadata.
|
||||
"""
|
||||
raw_jsons = list()
|
||||
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
|
||||
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)+glob.glob(
|
||||
f'jsons_unzipped/{project_dirname}/corpusjson/*.json', recursive=True)
|
||||
|
||||
for filename in all_paths:
|
||||
cur_json = _load_json_from_path(filename)
|
||||
|
||||
try:
|
||||
project_name = cur_json['project']
|
||||
sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
|
||||
except Exception as e:
|
||||
print(f"In file {filename} failed because of {e}")
|
||||
@@ -154,8 +157,7 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
|
||||
raw_text = get_raw_akk_text_from_json(sents_dicts)
|
||||
raw_jsons.append({
|
||||
"id_text": cur_json['textid'],
|
||||
"project_name": project_name,
|
||||
"raw_text": raw_text,
|
||||
"raw_text": raw_text
|
||||
})
|
||||
|
||||
# if not texts_jsons or not texts_jsons.get('members'):
|
||||
@@ -209,7 +211,7 @@ def _get_raw_text(json_dict: dict) -> str:
|
||||
raw_texts.extend(_get_raw_text(d['cdl']).split())
|
||||
elif _is_word(d): # If node represents a word
|
||||
if previous_ref != d.get('ref'): # If encountered new instance:
|
||||
cur_text = d['frag'] if d.get('frag') else d['f']['form']
|
||||
cur_text = d['f']['norm'] if d['f'].get('norm') else d['f']['form']
|
||||
raw_texts.append(cur_text + _get_addition(d))
|
||||
previous_ref = d.get('ref')
|
||||
|
||||
|
Reference in New Issue
Block a user