trying to scrape

.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
+venv/*

data.jsonl (new file, 1 line)
File diff suppressed because one or more lines are too long
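
The data.jsonl diff is suppressed, but num_words_in_english in scrapping.py (further down in this diff) reads a JSONL file of this kind line by line, and the scraper emits dicts with id_text, project_name, and raw_text keys. Assuming data.jsonl follows that shape, a minimal reader sketch (the printed fields are chosen for illustration):

    import json

    with open("data.jsonl", encoding="utf-8") as f_in:
        for line in f_in:
            record = json.loads(line)  # {"id_text": ..., "project_name": ..., "raw_text": ...}
            print(record["project_name"], len(record["raw_text"].split()))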

project_list (new file, 54 lines)
@@ -0,0 +1,54 @@
+adsd
+aemw
+akklove
+amgg
+ario
+armep
+arrim
+asbp
+atae
+babcity
+blms
+btmao
+btto
+cams
+caspo
+ccpo
+cdli
+ckst
+cmawro
+contrib
+contrib/amarna
+contrib/lambert
+ctij
+dcclt
+dccmt
+doc
+dsst
+ecut
+eisl
+epsd2
+etcsri
+glass
+hbtin
+lacost
+lovelyrics
+neo
+nere
+nimrud
+obel
+obmc
+obta
+ogsl
+oimea
+pnao
+qcat
+riao
+ribo
+rimanum
+rinap
+saao
+suhu
+tcma
+tsae
+xcat

project_notebook.ipynb (new file, 70 lines)
@@ -0,0 +1,70 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imported modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import scrapping\n",
+    "import psycopg2\n",
+    "from psycopg2.extras import execute_batch\n",
+    "# from psycopg2.extensions import register_adapter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
+    "connection = conn.cursor()\n",
+    "with open('project_list') as f:\n",
+    "    for project in f.read().split('\\n'):\n",
+    "        scrap = scrapping.get_raw_english_texts_of_project(project)\n",
+    "        execute_batch(connection, \"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\", scrap)\n",
+    "        conn.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
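
For reference, a minimal sketch of the execute_batch pattern the second notebook cell relies on. The raw_texts(id_text, project_name, raw_text) layout is inferred from the INSERT statement above; the DSN and the sample row are placeholders, not values from this commit:

    import psycopg2
    from psycopg2.extras import execute_batch

    # One dict per scraped text; the keys must match the named placeholders in the SQL.
    rows = [{"id_text": "X000001", "project_name": "saao/saa01", "raw_text": "..."}]

    conn = psycopg2.connect("dbname='dh' user='dh' host='localhost' password='...'")  # placeholder DSN
    with conn, conn.cursor() as cur:  # the connection block commits on success, rolls back on error
        execute_batch(
            cur,
            "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)",
            rows,
        )

Compared with calling cursor.execute once per row, execute_batch groups the parameter sets into far fewer server round-trips, which matters when a single project yields hundreds of texts.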

scrape.log (new file, 2627 lines)
File diff suppressed because it is too large

scrape.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+import urllib3
+import bs4
+import psycopg2
+
+# def connect_postgres():
+#     conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
+#     return conn
+
+def connect_postgres():
+    return psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
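
A short usage sketch for the connect_postgres helper, assuming the commented-out body above is the intended implementation (the COUNT query is illustrative; raw_texts is the table targeted by the notebook):

    from scrape import connect_postgres

    conn = connect_postgres()
    try:
        with conn.cursor() as cur:  # psycopg2 cursors are context managers
            cur.execute("select count(*) from raw_texts")
            print(cur.fetchone()[0])
    finally:
        conn.close()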

scrapping.py (new file, 227 lines)
@@ -0,0 +1,227 @@
+import json
+from typing import Dict, List
+import requests
+from bs4 import BeautifulSoup
+import os
+from pathlib import Path
+import re
+import glob
+import logging
+
+# parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# os.chdir(parent_dir)
+logging.basicConfig(filename='scrape.log', level=logging.INFO)
+JSONS_DIR = "jsons_unzipped"
+
+SUB_PROJECTS_IN_NEO_ASS = ["01", "05", "06", "07", "09", "11", "12", "14", "15", "16"]
+
+CORPUS_DIRNAME = "corpusjson"
+
+
+def _load_json_from_path(json_path: str) -> Dict:
+    """
+    This helper function loads a JSON file from a given path, skipping empty files.
+
+    :param json_path: A given string representing a path to a JSON file.
+    :return: The parsed JSON content (None for an empty file).
+    """
+    with open(json_path, "r", encoding='utf-8') as json_file:
+        if os.stat(json_path).st_size != 0:  # If the file is not empty:
+            return json.load(json_file)
+
+
+def get_raw_english_texts_of_project(project_dirname: str) -> List[Dict]:
+    """
+    This function scrapes the raw English texts of an ORACC project from its online HTML pages.
+
+    :param project_dirname: A given string representing the project's directory name.
+    :return: A list of jsons containing the raw texts of the given project and basic metadata.
+    """
+    raw_jsons = list()
+    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
+    # path = Path(os.path.join(JSONS_DIR, project_dirname, 'catalogue.json'))
+    # if not os.path.isfile(path):
+    #     return raw_jsons
+    # d = _load_json_to_dict(str(path))
+    # if d and d.get('members'):
+    # for member in d.get('members').values():
+    for filename in all_paths:
+        cur_json = _load_json_from_path(filename)
+        project_name = cur_json['project']
+
+        # # Skip in case we are in the saa project and the current sub-project is not in Neo-Assyrian
+        # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
+        #     continue
+
+        # id_text = member.get('id_text', "") + member.get('id_composite', "")
+        # html_dir = "/".join(path.parts[1:-1])
+        url = f"http://oracc.iaas.upenn.edu/{project_name}/{cur_json['textid']}/html"
+        # print(url)
+        logging.info(url)
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+            results = soup.find_all("span", {"class": "cell"})
+            raw_text = " ".join(["".join([content if isinstance(content, str) else content.text
+                                          for content in result.contents]) for result in results])
+            raw_text = raw_text.replace('\n', ' ')
+            if raw_text:
+                raw_jsons.append({
+                    "id_text": cur_json['textid'],
+                    "project_name": project_name,
+                    "raw_text": raw_text,
+                })
+        except Exception as e:
+            logging.error(e)
+    return raw_jsons
+
+
+def num_words_in_english(jsonl_file):
+    """Count the words in the raw texts of the saa* projects in a given JSONL file, ignoring parenthesized text."""
+    words_counter = 0
+    with open(jsonl_file, "r", encoding='utf-8') as f_in:
+        for line in f_in:
+            cur_json = json.loads(line)
+            if cur_json["project_name"].startswith("saa"):
+                cur_json["raw_text"] = re.sub(r'\([^)]*\)', '', cur_json["raw_text"])
+                words_counter += len(cur_json["raw_text"].split())
+    print(words_counter)
+
+
+def get_raw_text_akk_from_html(id_text, project_name):
+    """Fetch the online HTML page of a given ORACC text and extract its raw transliteration."""
+    # If the file doesn't exist in the local jsons -> look for it online
+    url = f'http://oracc.iaas.upenn.edu/{project_name}/{id_text}/html'
+    res = requests.get(url)
+    if res.status_code != 200:
+        print("******STATUS CODE IS NOT 200***********")
+        return ""
+    soup = BeautifulSoup(res.text, "html.parser")
+    # print(f"Check this out:\n{url}")
+    raw_text = _get_raw_text_html(soup)
+    # cur_json = {"id_text": id_text, "project_name": project_name, "raw_text": raw_text}
+    return raw_text
+
+
+def _get_raw_text_html1(soup):
+    """Simpler variant: join the text of every 'tt' paragraph in an ORACC HTML page."""
+    # Alternative: collect only the 'cbd' anchor words:
+    # return ' '.join(word.text for word in soup.find_all("a", class_="cbd"))
+    return ' '.join(tag.text for tag in soup.find_all('p', class_='tt'))
+
+
+def _get_raw_text_html(soup):
+    """Collect the signs of every word span (class 'w ...') in an ORACC HTML page, wrapping superscripts in braces."""
+    tags = soup.find_all('span', class_=lambda value: value and value.startswith('w '))
+    signs = list()
+    for tag in tags:
+        temp_tag = tag.find('a')
+        if temp_tag:
+            tag = temp_tag
+        for sign in tag.contents:
+            if isinstance(sign, str):
+                signs.append(sign)
+            elif sign.name == 'span':
+                signs.append(sign.text)
+            elif sign.name == 'sup':
+                signs.append("{" + sign.text + "}")
+    return ' '.join(signs)
+
+
+def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
+    """
+    This function parses the raw texts of a project in ORACC.
+
+    :param project_dirname: A given string representing the path to the project's directory.
+    :return: A list of jsons containing the raw texts of the given project and basic metadata.
+    """
+    raw_jsons = list()
+    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
+
+    for filename in all_paths:
+        cur_json = _load_json_from_path(filename)
+
+        try:
+            project_name = cur_json['project']
+            sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
+        except Exception as e:
+            print(f"File {filename} failed because of {e}")
+            continue
+
+        raw_text = get_raw_akk_text_from_json(sents_dicts)
+        raw_jsons.append({
+            "id_text": cur_json['textid'],
+            "project_name": project_name,
+            "raw_text": raw_text,
+        })
+
+    # if not texts_jsons or not texts_jsons.get('members'):
+    #     return raw_jsons
+    # for member in texts_jsons.get('members').values():  # Iterate over different tablets:
+    #     project_name = member['project'].split("/")[-1]
+    #
+    #     # Skip in case we are in the saa project and the current sub-project is not in Neo-Assyrian
+    #     if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
+    #         continue
+    #
+    #     id_text = member.get("id_text", "") + member.get("id_composite", "")
+    #     json_file_path = os.path.join(JSONS_DIR, Path(member['project']), CORPUS_DIRNAME, f'{id_text}.json')
+    #
+    #     if os.path.isfile(json_file_path):  # If file exists in the jsons
+    #         d = _load_json_to_dict(json_file_path)
+    #
+    #         try:
+    #             sents_dicts = d['cdl'][0]['cdl'][-1]['cdl']
+    #         except Exception as e:
+    #             print(f"In file {json_file_path} failed because of {e}")
+    #             continue
+    #
+    #         raw_text = " ".join([_get_raw_text(sent_dict['cdl']) for sent_dict in sents_dicts if _is_sent(sent_dict)])
+    #         cur_json = {
+    #             "id_text": id_text,
+    #             "project_name": project_name,
+    #             "raw_text": raw_text,
+    #         }
+    #         raw_jsons.append(cur_json)
+
+    return raw_jsons
+
+
+def get_raw_akk_text_from_json(sents_dicts):
+    """Join the raw text of every sentence node in a tablet's top-level CDL list."""
+    return " ".join([_get_raw_text(sent_dict['cdl']) for sent_dict in sents_dicts if _is_sent(sent_dict)])
+
+
+def _get_raw_text(json_dict: dict) -> str:
+    """
+    This function recursively extracts the raw text of a given transliterated tablet in ORACC.
+    It appends each instance in the tablet only once (even if there are multiple possible meanings).
+
+    :param json_dict: A given dictionary representing some portion of the words in the tablet.
+    :return: The aforementioned raw text.
+    """
+    previous_ref: str = ""
+    raw_texts = list()
+    for d in json_dict:
+        if _is_sent(d):  # If the node represents a sentence -> recurse into the inner dictionary
+            raw_texts.extend(_get_raw_text(d['cdl']).split())
+        elif _is_word(d):  # If the node represents a word
+            if previous_ref != d.get('ref'):  # If we encountered a new instance:
+                cur_text = d['frag'] if d.get('frag') else d['f']['form']
+                raw_texts.append(cur_text + _get_addition(d))
+                previous_ref = d.get('ref')
+
+    return " ".join(raw_texts)
+
+
+def _is_sent(d: Dict) -> bool:
+    return d.get('node') == 'c'
+
+
+def _is_word(d: Dict) -> bool:
+    return d.get('node') == 'l'
+
+
+def _get_addition(d: Dict) -> str:
+    """
+    This function looks for an asterisk or a question mark in a dictionary representing a word in a tablet from ORACC.
+
+    :param d: A given dictionary as described above.
+    :return: An asterisk or a question mark if one of the word's signs has one, otherwise an empty string.
+    """
+    has_signs_dicts = 'f' in d and 'gdl' in d.get('f')
+    if has_signs_dicts:
+        for sign_dict in d['f']['gdl']:
+            if 'gdl_collated' in sign_dict:  # If the current sign has an asterisk
+                return "*"
+            if 'queried' in sign_dict:  # If the current sign has a question mark
+                return "?"
+    return ""
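
To make the CDL traversal concrete, here is a toy fragment in the shape _get_raw_text expects. The 'c'/'l' node tags and the 'ref', 'frag', 'f', and 'gdl' keys mirror the functions above; the refs and word forms themselves are made up:

    from scrapping import _get_raw_text

    sample_cdl = [
        {"node": "c", "cdl": [                   # a sentence node -> recursed into
            {"node": "l", "ref": "X000001.1.1",  # a word node (made-up ref)
             "f": {"form": "a-na", "gdl": [{}]}},
            {"node": "l", "ref": "X000001.1.2",
             "frag": "LUGAL",                    # 'frag' takes precedence over f['form']
             "f": {"form": "šarri", "gdl": [{"queried": "1"}]}},  # 'queried' -> trailing "?"
            {"node": "l", "ref": "X000001.1.2",  # same ref as the previous word -> skipped as a repeat
             "f": {"form": "šarru", "gdl": [{}]}},
        ]},
    ]

    print(_get_raw_text(sample_cdl))  # -> "a-na LUGAL?"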