trying to scrape

.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
+venv/*

data.jsonl (new file, 1 line)
File diff suppressed because one or more lines are too long
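
The data.jsonl diff is suppressed, but num_words_in_english in scrapping.py (further down in this diff) reads a JSONL file of this kind line by line, and the scraper emits dicts with id_text, project_name, and raw_text keys. Assuming data.jsonl follows that shape, a minimal reader sketch (the printed fields are chosen for illustration):

    import json

    with open("data.jsonl", encoding="utf-8") as f_in:
        for line in f_in:
            record = json.loads(line)  # {"id_text": ..., "project_name": ..., "raw_text": ...}
            print(record["project_name"], len(record["raw_text"].split()))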

project_list (new file, 54 lines)
@@ -0,0 +1,54 @@
+adsd
+aemw
+akklove
+amgg
+ario
+armep
+arrim
+asbp
+atae
+babcity
+blms
+btmao
+btto
+cams
+caspo
+ccpo
+cdli
+ckst
+cmawro
+contrib
+contrib/amarna
+contrib/lambert
+ctij
+dcclt
+dccmt
+doc
+dsst
+ecut
+eisl
+epsd2
+etcsri
+glass
+hbtin
+lacost
+lovelyrics
+neo
+nere
+nimrud
+obel
+obmc
+obta
+ogsl
+oimea
+pnao
+qcat
+riao
+ribo
+rimanum
+rinap
+saao
+suhu
+tcma
+tsae
+xcat

project_notebook.ipynb (new file, 70 lines)
@@ -0,0 +1,70 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Imported modules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import scrapping\n",
+    "import psycopg2\n",
+    "from psycopg2.extras import execute_batch\n",
+    "# from psycopg2.extensions import register_adapter"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "conn = psycopg2.connect(\"dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'\")\n",
+    "connection = conn.cursor()\n",
+    "with open('project_list') as f:\n",
+    "    for project in f.read().split('\\n'):\n",
+    "        scrap = scrapping.get_raw_english_texts_of_project(project)\n",
+    "        execute_batch(connection, \"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\", scrap)\n",
+    "        conn.commit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
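
For reference, a minimal sketch of the execute_batch pattern the second notebook cell relies on. The raw_texts(id_text, project_name, raw_text) layout is inferred from the INSERT statement above; the DSN and the sample row are placeholders, not values from this commit:

    import psycopg2
    from psycopg2.extras import execute_batch

    # One dict per scraped text; the keys must match the named placeholders in the SQL.
    rows = [{"id_text": "X000001", "project_name": "saao/saa01", "raw_text": "..."}]

    conn = psycopg2.connect("dbname='dh' user='dh' host='localhost' password='...'")  # placeholder DSN
    with conn, conn.cursor() as cur:  # the connection block commits on success, rolls back on error
        execute_batch(
            cur,
            "insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)",
            rows,
        )

Compared with calling cursor.execute once per row, execute_batch groups the parameter sets into far fewer server round-trips, which matters when a single project yields hundreds of texts.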

scrape.log (new file, 2627 lines)
File diff suppressed because it is too large

scrape.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+import urllib3
+import bs4
+import psycopg2
+
+# def connect_postgres():
+#     conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
+#     return conn
+
+def connect_postgres():
+    return psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
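
A short usage sketch for the connect_postgres helper, assuming the commented-out body above is the intended implementation (the COUNT query is illustrative; raw_texts is the table targeted by the notebook):

    from scrape import connect_postgres

    conn = connect_postgres()
    try:
        with conn.cursor() as cur:  # psycopg2 cursors are context managers
            cur.execute("select count(*) from raw_texts")
            print(cur.fetchone()[0])
    finally:
        conn.close()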

scrapping.py (new file, 227 lines)
@@ -0,0 +1,227 @@
+import json
+from typing import Dict, List
+import requests
+from bs4 import BeautifulSoup
+import os
+from pathlib import Path
+import re
+import glob
+import logging
+
+# parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+# os.chdir(parent_dir)
+logging.basicConfig(filename='scrape.log', level=logging.INFO)
+JSONS_DIR = "jsons_unzipped"
+
+SUB_PROJECTS_IN_NEO_ASS = ["01", "05", "06", "07", "09", "11", "12", "14", "15", "16"]
+
+CORPUS_DIRNAME = "corpusjson"
+
+
+def _load_json_from_path(json_path: str) -> Dict:
+    """
+    This helper function loads a JSON file from a given path, skipping empty files.
+
+    :param json_path: A given string representing a path to a JSON file.
+    :return: The parsed JSON content (None for an empty file).
+    """
+    with open(json_path, "r", encoding='utf-8') as json_file:
+        if os.stat(json_path).st_size != 0:  # If the file is not empty:
+            return json.load(json_file)
+
+
+def get_raw_english_texts_of_project(project_dirname: str) -> List[Dict]:
+    """
+    This function scrapes the raw English texts of an ORACC project from its online HTML pages.
+
+    :param project_dirname: A given string representing the project's directory name.
+    :return: A list of jsons containing the raw texts of the given project and basic metadata.
+    """
+    raw_jsons = list()
+    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
+    # path = Path(os.path.join(JSONS_DIR, project_dirname, 'catalogue.json'))
+    # if not os.path.isfile(path):
+    #     return raw_jsons
+    # d = _load_json_to_dict(str(path))
+    # if d and d.get('members'):
+    # for member in d.get('members').values():
+    for filename in all_paths:
+        cur_json = _load_json_from_path(filename)
+        project_name = cur_json['project']
+
+        # # Skip in case we are in the saa project and the current sub-project is not in Neo-Assyrian
+        # if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
+        #     continue
+
+        # id_text = member.get('id_text', "") + member.get('id_composite', "")
+        # html_dir = "/".join(path.parts[1:-1])
+        url = f"http://oracc.iaas.upenn.edu/{project_name}/{cur_json['textid']}/html"
+        # print(url)
+        logging.info(url)
+        try:
+            res = requests.get(url)
+            soup = BeautifulSoup(res.text, "html.parser")
+            results = soup.find_all("span", {"class": "cell"})
+            raw_text = " ".join(["".join([content if isinstance(content, str) else content.text
+                                          for content in result.contents]) for result in results])
+            raw_text = raw_text.replace('\n', ' ')
+            if raw_text:
+                raw_jsons.append({
+                    "id_text": cur_json['textid'],
+                    "project_name": project_name,
+                    "raw_text": raw_text,
+                })
+        except Exception as e:
+            logging.error(e)
+    return raw_jsons
+
+
+def num_words_in_english(jsonl_file):
+    """Count the words in the raw texts of the saa* projects in a given JSONL file, ignoring parenthesized text."""
+    words_counter = 0
+    with open(jsonl_file, "r", encoding='utf-8') as f_in:
+        for line in f_in:
+            cur_json = json.loads(line)
+            if cur_json["project_name"].startswith("saa"):
+                cur_json["raw_text"] = re.sub(r'\([^)]*\)', '', cur_json["raw_text"])
+                words_counter += len(cur_json["raw_text"].split())
+    print(words_counter)
+
+
+def get_raw_text_akk_from_html(id_text, project_name):
+    """Fetch the online HTML page of a given ORACC text and extract its raw transliteration."""
+    # If the file doesn't exist in the local jsons -> look for it online
+    url = f'http://oracc.iaas.upenn.edu/{project_name}/{id_text}/html'
+    res = requests.get(url)
+    if res.status_code != 200:
+        print("******STATUS CODE IS NOT 200***********")
+        return ""
+    soup = BeautifulSoup(res.text, "html.parser")
+    # print(f"Check this out:\n{url}")
+    raw_text = _get_raw_text_html(soup)
+    # cur_json = {"id_text": id_text, "project_name": project_name, "raw_text": raw_text}
+    return raw_text
+
+
+def _get_raw_text_html1(soup):
+    """Simpler variant: join the text of every 'tt' paragraph in an ORACC HTML page."""
+    # Alternative: collect only the 'cbd' anchor words:
+    # return ' '.join(word.text for word in soup.find_all("a", class_="cbd"))
+    return ' '.join(tag.text for tag in soup.find_all('p', class_='tt'))
+
+
+def _get_raw_text_html(soup):
+    """Collect the signs of every word span (class 'w ...') in an ORACC HTML page, wrapping superscripts in braces."""
+    tags = soup.find_all('span', class_=lambda value: value and value.startswith('w '))
+    signs = list()
+    for tag in tags:
+        temp_tag = tag.find('a')
+        if temp_tag:
+            tag = temp_tag
+        for sign in tag.contents:
+            if isinstance(sign, str):
+                signs.append(sign)
+            elif sign.name == 'span':
+                signs.append(sign.text)
+            elif sign.name == 'sup':
+                signs.append("{" + sign.text + "}")
+    return ' '.join(signs)
+
+
+def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
+    """
+    This function parses the raw texts of a project in ORACC.
+
+    :param project_dirname: A given string representing the path to the project's directory.
+    :return: A list of jsons containing the raw texts of the given project and basic metadata.
+    """
+    raw_jsons = list()
+    all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
+
+    for filename in all_paths:
+        cur_json = _load_json_from_path(filename)
+
+        try:
+            project_name = cur_json['project']
+            sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
+        except Exception as e:
+            print(f"File {filename} failed because of {e}")
+            continue
+
+        raw_text = get_raw_akk_text_from_json(sents_dicts)
+        raw_jsons.append({
+            "id_text": cur_json['textid'],
+            "project_name": project_name,
+            "raw_text": raw_text,
+        })
+
+    # if not texts_jsons or not texts_jsons.get('members'):
+    #     return raw_jsons
+    # for member in texts_jsons.get('members').values():  # Iterate over different tablets:
+    #     project_name = member['project'].split("/")[-1]
+    #
+    #     # Skip in case we are in the saa project and the current sub-project is not in Neo-Assyrian
+    #     if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS:  # TODO: validate
+    #         continue
+    #
+    #     id_text = member.get("id_text", "") + member.get("id_composite", "")
+    #     json_file_path = os.path.join(JSONS_DIR, Path(member['project']), CORPUS_DIRNAME, f'{id_text}.json')
+    #
+    #     if os.path.isfile(json_file_path):  # If file exists in the jsons
+    #         d = _load_json_to_dict(json_file_path)
+    #
+    #         try:
+    #             sents_dicts = d['cdl'][0]['cdl'][-1]['cdl']
+    #         except Exception as e:
+    #             print(f"In file {json_file_path} failed because of {e}")
+    #             continue
+    #
+    #         raw_text = " ".join([_get_raw_text(sent_dict['cdl']) for sent_dict in sents_dicts if _is_sent(sent_dict)])
+    #         cur_json = {
+    #             "id_text": id_text,
+    #             "project_name": project_name,
+    #             "raw_text": raw_text,
+    #         }
+    #         raw_jsons.append(cur_json)
+
+    return raw_jsons
+
+
+def get_raw_akk_text_from_json(sents_dicts):
+    """Join the raw text of every sentence node in a tablet's top-level CDL list."""
+    return " ".join([_get_raw_text(sent_dict['cdl']) for sent_dict in sents_dicts if _is_sent(sent_dict)])
+
+
+def _get_raw_text(json_dict: dict) -> str:
+    """
+    This function recursively extracts the raw text of a given transliterated tablet in ORACC.
+    It appends each instance in the tablet only once (even if there are multiple possible meanings).
+
+    :param json_dict: A given dictionary representing some portion of the words in the tablet.
+    :return: The aforementioned raw text.
+    """
+    previous_ref: str = ""
+    raw_texts = list()
+    for d in json_dict:
+        if _is_sent(d):  # If the node represents a sentence -> recurse into the inner dictionary
+            raw_texts.extend(_get_raw_text(d['cdl']).split())
+        elif _is_word(d):  # If the node represents a word
+            if previous_ref != d.get('ref'):  # If we encountered a new instance:
+                cur_text = d['frag'] if d.get('frag') else d['f']['form']
+                raw_texts.append(cur_text + _get_addition(d))
+                previous_ref = d.get('ref')
+
+    return " ".join(raw_texts)
+
+
+def _is_sent(d: Dict) -> bool:
+    return d.get('node') == 'c'
+
+
+def _is_word(d: Dict) -> bool:
+    return d.get('node') == 'l'
+
+
+def _get_addition(d: Dict) -> str:
+    """
+    This function looks for an asterisk or a question mark in a dictionary representing a word in a tablet from ORACC.
+
+    :param d: A given dictionary as described above.
+    :return: An asterisk or a question mark if one of the word's signs has one, otherwise an empty string.
+    """
+    has_signs_dicts = 'f' in d and 'gdl' in d.get('f')
+    if has_signs_dicts:
+        for sign_dict in d['f']['gdl']:
+            if 'gdl_collated' in sign_dict:  # If the current sign has an asterisk
+                return "*"
+            if 'queried' in sign_dict:  # If the current sign has a question mark
+                return "?"
+    return ""
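
To make the CDL traversal concrete, here is a toy fragment in the shape _get_raw_text expects. The 'c'/'l' node tags and the 'ref', 'frag', 'f', and 'gdl' keys mirror the functions above; the refs and word forms themselves are made up:

    from scrapping import _get_raw_text

    sample_cdl = [
        {"node": "c", "cdl": [                   # a sentence node -> recursed into
            {"node": "l", "ref": "X000001.1.1",  # a word node (made-up ref)
             "f": {"form": "a-na", "gdl": [{}]}},
            {"node": "l", "ref": "X000001.1.2",
             "frag": "LUGAL",                    # 'frag' takes precedence over f['form']
             "f": {"form": "šarri", "gdl": [{"queried": "1"}]}},  # 'queried' -> trailing "?"
            {"node": "l", "ref": "X000001.1.2",  # same ref as the previous word -> skipped as a repeat
             "f": {"form": "šarru", "gdl": [{}]}},
        ]},
    ]

    print(_get_raw_text(sample_cdl))  # -> "a-na LUGAL?"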