finished scrapping all the data

scrapping.py | 28
@@ -1,7 +1,7 @@
 import json
 from typing import Dict, List
 import requests
-from bs4 import BeautifulSoup
+from bs4 import BeautifulSoup, ResultSet
 import os
 from pathlib import Path
 import re
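The only change in this hunk is pulling in ResultSet for the new type hints below; it is the list subclass that find_all returns. Note that logging, which the new helpers call, is not among the imports shown here and is presumably imported further down in scrapping.py. A quick sanity check with made-up markup:

from bs4 import BeautifulSoup, ResultSet

cells = BeautifulSoup('<span class="cell">x</span>', "html.parser").find_all("span", {"class": "cell"})
assert isinstance(cells, ResultSet)   # find_all returns a ResultSet,
assert isinstance(cells, list)        # which is just a list subclass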
@@ -31,6 +31,18 @@ def _load_json_from_path(json_path: str) -> Dict:
         if os.stat(json_path).st_size != 0:  # If the file is not empty:
             return json.load(json_file)
 
+def _download_data_from_website(url: str) -> ResultSet:
+    try:
+        res = requests.get(url)
+        soup = BeautifulSoup(res.text, "html.parser")
+        return soup.find_all("span", {"class": "cell"})
+    except Exception as e:
+        logging.error(e)
+        return ResultSet(None)  # bs4's ResultSet requires a source argument; a bare ResultSet() raises TypeError
+
+def _clean_raw_text(results: ResultSet) -> str:
+    return " ".join(["".join([content if isinstance(content, str) else content.text
+                              for content in result.contents]) for result in results]).replace('\n', ' ')
 
 def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
     raw_jsons = list()
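A small, self-contained sketch of what the new _clean_raw_text logic produces for a couple of cells; the markup is invented for illustration, and the expression is the same join-and-flatten used in the helper above:

from bs4 import BeautifulSoup

html = '<span class="cell">King <b>Sargon</b>\n</span><span class="cell">of Akkad</span>'
cells = BeautifulSoup(html, "html.parser").find_all("span", {"class": "cell"})

# Per cell, concatenate plain strings and the text of nested tags, then join the
# cells with spaces and collapse newlines (same expression as _clean_raw_text).
text = " ".join(["".join([c if isinstance(c, str) else c.text
                          for c in cell.contents]) for cell in cells]).replace('\n', ' ')
print(text)  # "King Sargon  of Akkad"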
@@ -44,8 +56,11 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
     # for member in d.get('members').values():
     for filename in all_paths:
         cur_json = _load_json_from_path(filename)
-        project_name = cur_json['project']
-
+        try:
+            project_name = cur_json['project']
+        except TypeError:
+            logging.error(f"Error in {filename}")
+            continue
         # # Skip in case we are in saa project and the current sub project is not in neo-assyrian
         # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
         #     continue
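For context, the TypeError being caught comes from _load_json_from_path, which implicitly returns None for an empty file (see the size check in the previous hunk), so subscripting the result fails before any KeyError could occur. A minimal illustration:

cur_json = None            # what _load_json_from_path returns for an empty JSON file
try:
    project_name = cur_json['project']
except TypeError:          # 'NoneType' object is not subscriptable
    project_name = None    # the real loop logs the filename and continues instead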
@@ -56,12 +71,7 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
         # print(url)
         logging.info(url)
-        try:
-            res = requests.get(url)
-            soup = BeautifulSoup(res.text, "html.parser")
-            results = soup.find_all("span", {"class": "cell"})
-            raw_text = " ".join(["".join([content if isinstance(content, str) else content.text
-                                          for content in result.contents]) for result in results])
-            raw_text = raw_text.replace('\n', ' ')
+        raw_text = _clean_raw_text(_download_data_from_website(url))
         if raw_text:
             raw_jsons.append({
                 "id_text": cur_json['textid'],
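The calling code is unchanged by this refactor. A hedged usage sketch of the public entry point: the "saao" project directory comes from the comments above, while the output filename is only an example, and it assumes scrapping.py is importable from the working directory.

import json

from scrapping import get_raw_english_texts_of_project

# Scrape one ORACC project and dump the raw English texts; the output path is illustrative.
raw_jsons = get_raw_english_texts_of_project("saao")  # default site: oracc.museum.upenn.edu
with open("saao_raw_texts.json", "w", encoding="utf-8") as out_file:
    json.dump(raw_jsons, out_file, ensure_ascii=False, indent=2)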