trying to scrape
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
venv/*
1
data.jsonl
Normal file
File diff suppressed because one or more lines are too long
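The suppressed diff hides the records, but judging from the dicts that scrapping.py builds, each line of data.jsonl presumably holds one scraped text (the field values below are placeholders, not taken from the commit):

{"id_text": "P000001", "project_name": "saao/saa01", "raw_text": "..."}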
54
project_list
Normal file
@@ -0,0 +1,54 @@
adsd
aemw
akklove
amgg
ario
armep
arrim
asbp
atae
babcity
blms
btmao
btto
cams
caspo
ccpo
cdli
ckst
cmawro
contrib
contrib/amarna
contrib/lambert
ctij
dcclt
dccmt
doc
dsst
ecut
eisl
epsd2
etcsri
glass
hbtin
lacost
lovelyrics
neo
nere
nimrud
obel
obmc
obta
ogsl
oimea
pnao
qcat
riao
ribo
rimanum
rinap
saao
suhu
tcma
tsae
xcat
70
project_notebook.ipynb
Normal file
@@ -0,0 +1,70 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imported modules"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import scrapping\n",
    "import psycopg2\n",
    "from psycopg2.extras import execute_batch\n",
    "# from psycopg2.extensions import register_adapter"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
    "cursor = conn.cursor()\n",
    "with open('project_list') as f:\n",
    "    for project in f.read().split('\\n'):\n",
    "        scrap = scrapping.get_raw_english_texts_of_project(project)\n",
    "        # cursor.execute() returns None, so the cursor must not be reassigned;\n",
    "        # execute_batch() already runs the INSERT once per dict in scrap.\n",
    "        execute_batch(cursor, \"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\", scrap)\n",
    "        conn.commit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.13"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
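The INSERT in the notebook implies a three-column raw_texts table. A minimal sketch of a schema that would accept these rows; the column names come from the named placeholders above, but the types are an assumption, not taken from the commit:

cursor.execute("""
    create table if not exists raw_texts (
        id_text      text,
        project_name text,
        raw_text     text
    )
""")
conn.commit()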
2627
scrape.log
Normal file
File diff suppressed because it is too large
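The log itself is suppressed, but given logging.basicConfig(filename='scrape.log', level=logging.INFO) and the logging.info(url) / logging.error(e) calls in scrapping.py, its 2627 lines should follow Python's default log format, e.g.:

INFO:root:http://oracc.iaas.upenn.edu/<project>/<textid>/html
ERROR:root:<exception message>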
9
scrape.py
Normal file
@@ -0,0 +1,9 @@
import urllib3
import bs4
import psycopg2


# def connect_postgres():
#     conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
#     return conn


def main():  # Name and body are a guess: the committed file ends at a bare "def"
    pass
227
scrapping.py
Normal file
@@ -0,0 +1,227 @@
import json
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
import os
from pathlib import Path
import re
import glob
import logging
# parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
# os.chdir(parent_dir)
logging.basicConfig(filename='scrape.log', level=logging.INFO)

JSONS_DIR = "jsons_unzipped"

SUB_PROJECTS_IN_NEO_ASS = ["01", "05", "06", "07", "09", "11", "12", "14", "15", "16"]

CORPUS_DIRNAME = "corpusjson"


def _load_json_from_path(json_path: str) -> Dict:
    """
    This helper function loads a JSON file from a given path, skipping empty files.

    :param json_path: A given string representing a path to a JSON file.
    :return: The parsed JSON content, or None if the file is empty.
    """
    with open(json_path, "r", encoding='utf-8') as json_file:
        if os.stat(json_path).st_size != 0:  # If the file is not empty:
            return json.load(json_file)


def get_raw_english_texts_of_project(project_dirname: str) -> List[Dict]:
    raw_jsons = list()
    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
    # path = Path(os.path.join(JSONS_DIR, project_dirname, 'catalogue.json'))
    # if not os.path.isfile(path):
    #     return raw_jsons
    # d = _load_json_to_dict(str(path))
    # if d and d.get('members'):
    #     for member in d.get('members').values():
    for filename in all_paths:
        cur_json = _load_json_from_path(filename)
        if cur_json is None:  # _load_json_from_path returns None for empty files
            continue
        project_name = cur_json['project']

        # # Skip in case we are in saa project and the current sub project is not in neo-assyrian
        # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
        #     continue

        # id_text = member.get('id_text', "") + member.get('id_composite', "")
        # html_dir = "/".join(path.parts[1:-1])
        url = f"http://oracc.iaas.upenn.edu/{project_name}/{cur_json['textid']}/html"
        # print(url)
        logging.info(url)
        try:
            res = requests.get(url)
            soup = BeautifulSoup(res.text, "html.parser")
            results = soup.find_all("span", {"class": "cell"})
            raw_text = " ".join(["".join([content if isinstance(content, str) else content.text
                                          for content in result.contents]) for result in results])
            raw_text = raw_text.replace('\n', ' ')
            if raw_text:
                raw_jsons.append({
                    "id_text": cur_json['textid'],
                    "project_name": project_name,
                    "raw_text": raw_text,
                })
        except Exception as e:
            logging.error(e)
    return raw_jsons


def num_words_in_english(jsonl_file):
    words_counter = 0
    with open(jsonl_file, "r", encoding='utf-8') as f_in:
        for line in f_in:
            cur_json = json.loads(line)
            if cur_json["project_name"].startswith("saa"):
                # Remove parenthesized spans before counting
                cur_json["raw_text"] = re.sub(r'\([^)]*\)', '', cur_json["raw_text"])
            words_counter += len(cur_json["raw_text"].split())
    print(words_counter)


def get_raw_text_akk_from_html(id_text, project_name):
    # else:  # If the file doesn't exist in the jsons -> look for it online
    url = f'http://oracc.iaas.upenn.edu/{project_name}/{id_text}/html'
    res = requests.get(url)
    if res.status_code != 200:
        print("******STATUS CODE IS NOT 200***********")
        return ""
    soup = BeautifulSoup(res.text, "html.parser")
    # print(f"Check this out:\n{url}")
    raw_text = _get_raw_text_html(soup)
    # cur_json = {"id_text": id_text, "project_name": project_name, "raw_text": raw_text}
    return raw_text
def _get_raw_text_html1(soup):
    # An earlier variant collected the <a class="cbd"> word links; the second
    # return made it unreachable, so it is kept here only as a comment:
    # words = soup.find_all("a", class_="cbd")
    # return ' '.join([word.text for word in words])
    return ' '.join(tag.text for tag in soup.find_all('p', class_='tt'))


def _get_raw_text_html(soup):
    # Each transliterated word is rendered as a <span> whose class starts with "w "
    tags = soup.find_all('span', class_=lambda value: value and value.startswith('w '))
    signs = list()
    for tag in tags:
        temp_tag = tag.find('a')
        if temp_tag:  # Prefer the inner glossary link when present
            tag = temp_tag
        for sign in tag.contents:
            if isinstance(sign, str):
                signs.append(sign)
            elif sign.name == 'span':
                signs.append(sign.text)
            elif sign.name == 'sup':  # Superscript signs are wrapped in braces
                signs.append("{" + sign.text + "}")
    return ' '.join(signs)


def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
    """
    This function parses the raw texts of a project in ORACC.

    :param project_dirname: A given string representing the path to the project's directory.
    :return: A list of jsons containing the raw texts of the given project and basic metadata.
    """
    raw_jsons = list()
    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)

    for filename in all_paths:
        cur_json = _load_json_from_path(filename)

        try:
            project_name = cur_json['project']
            sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
        except Exception as e:
            print(f"In file {filename} failed because of {e}")
            continue

        raw_text = get_raw_akk_text_from_json(sents_dicts)
        raw_jsons.append({
            "id_text": cur_json['textid'],
            "project_name": project_name,
            "raw_text": raw_text,
        })

    # if not texts_jsons or not texts_jsons.get('members'):
    #     return raw_jsons
    # for member in texts_jsons.get('members').values():  # Iterate over different tablets:
    #     project_name = member['project'].split("/")[-1]
    #
    #     # Skip in case we are in saa project and the current sub project is not in neo-assyrian
    #     if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
    #         continue
    #
    #     id_text = member.get("id_text", "") + member.get("id_composite", "")
    #     json_file_path = os.path.join(JSONS_DIR, Path(member['project']), CORPUS_DIRNAME, f'{id_text}.json')
    #
    #     if os.path.isfile(json_file_path):  # If file exists in the jsons
    #         d = _load_json_to_dict(json_file_path)
    #
    #         try:
    #             sents_dicts = d['cdl'][0]['cdl'][-1]['cdl']
    #         except Exception as e:
    #             print(f"In file {json_file_path} failed because of {e}")
    #             continue
    #
    #         raw_text = " ".join([_get_raw_text(sent_dict['cdl']) for sent_dict in sents_dicts if _is_sent(sent_dict)])
    #         cur_json = {
    #             "id_text": id_text,
    #             "project_name": project_name,
    #             "raw_text": raw_text,
    #         }
    #         raw_jsons.append(cur_json)

    return raw_jsons


def get_raw_akk_text_from_json(sents_dicts):
    return " ".join([_get_raw_text(sent_dict['cdl']) for sent_dict in sents_dicts if _is_sent(sent_dict)])


def _get_raw_text(json_dict: List[Dict]) -> str:
    """
    This function gets the raw text of a given transliterated tablet in ORACC recursively.
    It appends each instance in the tablet only once (even if there are multiple possible meanings).

    :param json_dict: A given list of dictionaries, each representing some portion of the words in the tablet.
    :return: The aforementioned raw text.
    """
    previous_ref: str = ""
    raw_texts = list()
    for d in json_dict:
        if _is_sent(d):  # If the node represents a sentence -> recurse into the inner dictionaries
            raw_texts.extend(_get_raw_text(d['cdl']).split())
        elif _is_word(d):  # If the node represents a word
            if previous_ref != d.get('ref'):  # If we encountered a new instance:
                cur_text = d['frag'] if d.get('frag') else d['f']['form']
                raw_texts.append(cur_text + _get_addition(d))
                previous_ref = d.get('ref')

    return " ".join(raw_texts)


def _is_sent(d: Dict) -> bool:
    return d.get('node') == 'c'


def _is_word(d: Dict) -> bool:
    return d.get('node') == 'l'


def _get_addition(d: Dict) -> str:
    """
    This function looks for an asterisk or a question mark in a dictionary representing a word in a tablet from ORACC.

    :param d: A given dictionary as described above.
    :return: An asterisk or a question mark if one of the word's signs has one, otherwise an empty string.
    """
    has_signs_dicts = 'f' in d and 'gdl' in d.get('f')
    if has_signs_dicts:
        for sign_dict in d['f']['gdl']:
            if 'gdl_collated' in sign_dict:  # If the current sign has an asterisk
                return "*"
            if 'queried' in sign_dict:  # If the current sign has a question mark
                return "?"
    return ""