Compare commits

...

42 Commits

Author SHA1 Message Date
8b8e15b082 auto rtl has been added 2023-10-21 15:37:32 +03:00
8352a0a097 summery 2023-08-12 18:19:37 +03:00
0e26118247 remove problems 2023-08-12 18:13:10 +03:00
f8e1c4d062 demostration 2023-08-12 17:46:38 +03:00
46152eadbf update of processing 2023-08-12 15:41:13 +03:00
01525451c7 added processing 2023-08-10 18:09:19 +03:00
be4e16ed35 update the project data collecting and the steps for it 2023-08-09 18:54:42 +03:00
5f91215acd update the project data collecting and the steps for it 2023-08-09 17:20:43 +03:00
server
1e4f87368e update the report progress 2023-08-09 00:37:42 +03:00
server
aad15a2a5a readme 2023-08-08 18:53:05 +03:00
server
ee5983a7c5 updated source data, data grab 2023-08-08 18:36:53 +03:00
server
acc006df1b Merge branch 'master' of https://git.saret.tk/saret/DH 2023-08-08 17:11:29 +03:00
server
31d2007bcb no raw data 2023-08-08 17:06:12 +03:00
server
0b66f6cf1d no raw data 2023-08-08 17:06:12 +03:00
server
e7e18c3300 updated goals 2023-08-08 16:59:57 +03:00
server
cc8dfeea0d updated goals 2023-08-08 16:59:57 +03:00
server
8f0dd858e2 readme update 2023-08-08 16:28:08 +03:00
server
0448b6c447 readme update 2023-08-08 16:28:08 +03:00
server
1b6d0d2129 readme update 2023-08-08 16:27:47 +03:00
server
78e9e7502a readme update 2023-08-08 16:27:47 +03:00
server
afe0eaf41d readme update 2023-08-08 16:26:18 +03:00
server
adad325c44 readme update 2023-08-08 16:26:18 +03:00
server
0118af822c readme update 2023-08-08 16:26:01 +03:00
server
f784ad9999 readme update 2023-08-08 16:26:01 +03:00
server
bab0735bf7 readme up 2023-08-08 16:25:04 +03:00
server
5af637c650 readme up 2023-08-08 16:25:04 +03:00
server
1545cdac8d starting the report 2023-08-08 16:08:55 +03:00
server
fcdbfe86fa starting the report 2023-08-08 16:08:55 +03:00
server
201626c66a update report 2023-08-08 15:57:25 +03:00
server
5233079481 update report 2023-08-08 15:57:25 +03:00
2735fb9ea2 failed scraping 2023-06-27 11:53:56 +03:00
09aa16dcc8 failed scraping 2023-06-27 11:53:56 +03:00
98d3d5994f boolean similarity 2023-06-26 23:21:34 +03:00
03f1d663d0 boolean similarity 2023-06-26 23:21:34 +03:00
826a100f24 update 2023-06-26 23:12:28 +03:00
db8244d902 update 2023-06-26 23:12:28 +03:00
server
26bbbe7d8c updates from server 2023-04-19 06:59:45 +03:00
89ec3a3578 update the progress 2023-04-17 02:51:50 +03:00
a9e93bd99f requirements 2023-04-16 20:24:47 +03:00
4aaeb48ffb remove irelevant files 2023-04-13 14:03:15 +03:00
218a3d8135 finished scrapping all the data 2023-04-12 22:05:16 +03:00
df548fa29d update folders 2023-04-12 12:34:56 +03:00
66 changed files with 178121 additions and 30443 deletions

7
.gitignore vendored
View File

@@ -1 +1,6 @@
venv/*
venv/*
logs/*
.vscode/*
.ipynb_checkpoints/*
__pycache__/*
*.csv

File diff suppressed because it is too large Load Diff

View File

@@ -1,5 +0,0 @@
root - INFO - http://oracc.iaas.upenn.edu/ario/Q006653/html
root - INFO - http://oracc.iaas.upenn.edu/ario/Q006654/html
root - INFO - http://oracc.iaas.upenn.edu/ario/Q006655/html
root - INFO - http://oracc.iaas.upenn.edu/ario/Q007129/html
root - INFO - http://oracc.iaas.upenn.edu/ario/Q007130/html

File diff suppressed because it is too large Load Diff

View File

@@ -1,447 +0,0 @@
root - INFO - Start
root - INFO - Start
root - INFO - reading adsd
root - INFO - reading aemw
root - INFO - reading akklove
root - INFO - reading asbp
root - INFO - reading atae
root - INFO - reading babcity
root - INFO - reading blms
root - INFO - reading btmao
root - INFO - reading btto
root - INFO - reading cams
root - INFO - reading ccpo
root - INFO - reading cdli
root - INFO - reading ckst
root - INFO - reading cmawro
root - INFO - reading contrib
root - INFO - reading dcclt
root - INFO - reading dccmt
root - INFO - reading doc
root - INFO - reading dsst
root - INFO - reading ecut
root - INFO - reading eisl
root - INFO - reading epsd2
root - INFO - reading etcsri
root - INFO - reading glass
root - INFO - reading hbtin
root - INFO - reading lacost
root - INFO - reading lovelyrics
root - INFO - reading neo
root - INFO - reading nere
root - INFO - reading nimrud
root - INFO - reading obel
root - INFO - reading obmc
root - INFO - reading obta
root - INFO - reading ogsl
root - INFO - reading oimea
root - INFO - reading pnao
root - INFO - reading qcat
root - INFO - reading riao
root - INFO - reading ribo
root - INFO - reading rimanum
root - INFO - reading rinap
root - INFO - reading saao
root - INFO - reading suhu
root - INFO - reading tcma
root - INFO - reading tsae
root - INFO - reading xcat
sroot - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224395/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224403/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224417/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224431/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224433/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224447/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224485/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224487/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P224587/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313416/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313417/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313425/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313427/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313435/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313437/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313439/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313447/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313458/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313487/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313491/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313497/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313502/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313505/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313509/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313511/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313523/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313527/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313543/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313551/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313559/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313571/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313598/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313600/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313614/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313623/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313626/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313627/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313629/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313644/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313648/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313660/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313677/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313684/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313699/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313719/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313722/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313726/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313742/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313748/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313755/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313759/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313762/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313807/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313815/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313832/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313854/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313864/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313871/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313874/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313876/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313878/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313879/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313885/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313897/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313904/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313907/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313915/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313919/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313923/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313938/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313947/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313974/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P313975/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314001/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314003/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314022/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314026/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314030/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314032/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314048/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314051/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314054/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314056/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314134/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314144/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314157/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314211/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314223/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314227/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314232/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314238/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314243/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314248/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314257/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314260/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314272/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314273/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314275/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314282/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314287/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314297/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P314316/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334036/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334037/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334038/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334039/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334040/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334041/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334042/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334043/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334044/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334045/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334046/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334047/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334048/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334049/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334050/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334051/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334053/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334054/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334055/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334056/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334078/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334079/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334080/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334081/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334082/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334083/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334100/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334101/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334102/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334113/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334118/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334120/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334124/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334125/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334127/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334135/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334136/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334141/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334142/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334143/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334144/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334158/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334160/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334165/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334166/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334170/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334171/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334172/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334173/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334174/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334175/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334176/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334190/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334193/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334194/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334195/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334209/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334210/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334211/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334212/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334213/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334214/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334271/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334272/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334273/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334284/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334288/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334298/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334314/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334317/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334328/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334329/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334330/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334331/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334332/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334333/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334334/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334335/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334336/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334359/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334372/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334384/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334385/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334390/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334394/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334396/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334397/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334403/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334422/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334432/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334435/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334442/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334443/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334444/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334445/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334496/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334499/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334512/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334519/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334520/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334567/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334568/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334586/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334587/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334588/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334592/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334598/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334610/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334621/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334631/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334632/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334634/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334643/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334644/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334658/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334665/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334667/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334676/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334687/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334689/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334693/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334699/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334709/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334715/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334718/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334721/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334727/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334728/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334729/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334739/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334773/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334774/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334776/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334789/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334791/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334792/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334794/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334804/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334807/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334810/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334820/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334826/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334830/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334834/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334849/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334864/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334895/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334903/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334904/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334910/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334912/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334918/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334922/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P334923/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P336167/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P336172/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P336595/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P336596/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P336597/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P393855/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P393866/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/P428858/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa01/X010028/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P237185/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P240211/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P296062/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P314346/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P334814/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336039/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336040/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336126/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336216/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336217/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336218/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336220/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P336317/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/P500551/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa02/Q009186/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P223388/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P237351/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P238051/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P238321/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P238357/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P313430/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P313818/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P334919/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P334925/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P334926/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P334929/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P334930/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P334931/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P334932/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336128/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336130/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336144/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336149/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336150/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336151/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336158/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336161/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336175/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336225/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336226/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336234/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336242/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336243/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336244/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336245/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336291/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336306/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336599/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336600/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336601/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336602/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336603/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336604/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336605/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336606/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336607/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336608/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336609/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P336796/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P337164/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P338360/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P338383/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P338404/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P338675/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/P338681/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/Q009249/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa03/Q009250/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236925/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236926/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236927/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236928/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236929/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236937/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236943/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236955/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236956/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P236960/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237009/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237018/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237027/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237030/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237053/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237081/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237119/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237127/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237128/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237168/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237173/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237190/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237191/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237203/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237208/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237209/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237210/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237212/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237213/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237222/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237224/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237231/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237355/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237358/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237360/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237361/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237362/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237363/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237364/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237365/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237366/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237367/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237369/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237370/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237371/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237372/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237373/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237374/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237376/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237377/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237378/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237380/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237383/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237386/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237387/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237405/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237410/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237412/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237413/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237416/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237417/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237423/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237435/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237443/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237449/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237461/html
root - INFO - http://oracc.museum.upenn.edu/saao/saa04/P237479/html

0
2023-05-15_21-45-30.log Normal file
View File

View File

@@ -1,5 +1,48 @@
# DH
This is the project for course {ENTERCOUSENUMBER} of Dr. Renana Keidar
This is the project for course 33503 of Dr. Renana Keidar
Project, By Benny Saret
# דו"ח התקדמות
## מטרות
מטרת הפרוייקט היא לייצר דרך למצוא קרבה או אינטרטקסטואליות בין טקסטים שונים באכדית בין תקופות שונות, סוגות שונות ומרחקים גיאוגרפיים. [אינטרטקסטואליות](https://www.merriam-webster.com/dictionary/intertextually) היא מונח המתאר מערכת קרבה וקשר בין טקסט מסויים לטקסטים אחרים, המשמשים כחומר מצע, התכתבות, או ויכוח לאותו טקסט. את אותה קרבה ניתן לראות בעזרת מינוחים דומים, דימויים דומים, שיבוצי כתובים ועוד.
## נתוני מקור
נתוני המקור כולם נלקחו מפרוייקט ORACC [The Open Richly Annotated Cuneiform Corpus](http://oracc.museum.upenn.edu/ "ORACC, (לקמן, אוראקק)"). פרוייקט זה, הוא הפרוייקט הגדול והמקיף ביותר של טקסטים בכתב יתדות, פתוחים ונגישים לשימוש לקהל הרחב, ולחוקרים מכול הסוגים. הנתונים מגיעים בפורמטי JSON,TEI,XML ו־HTML, ומתעדכנים בכול עת.
בפרוייקט ישנם לא רק טקסטים באכדית, אלא גם טקסטים באוררטית, שומרית וכן גם טקסטים בשפות משולבות של איזורי סְפָֿר.
## אופן העבודה
### איסוף הנתונים
<style>
ul{
text-align: right;
direction: rtl;
}
li{
text-align: right;
direction: rtl;
}
</style>
השלב הראשון בפרוייקט היה איסוף הנתונים מאוראקק. תת השלבים של האיסוף היו:
1. הקמת נתונון לשמירה של המידע הנאסף. הנתונון שנבחר היה postgresql, נתונון יחסי המממש את שפת SQL.
1. יצירת טבלאות להכנסת הנתונים. לשם כך נוצרו הטבלאות הבאות
- סוגה: טבלא בשם סוגה (genre) שמרה בתוכה את הסוגה של כול טקסט, לפי קוד הטקסט. [Genre](https://dh.saret.tk/dh/api/ggenre)
- פרוייקט: טבלא בשם פרוייקט (project) שמרה בתוכה את כלל שמות הפרוייקטים ותתי הפרוייקטים. טבלא זו נדרשה בעיקר בשלב גרידת הטקסטים. [Project](https://dh.saret.tk/dh/api/gprojects)
- תעתיק: טבלא בשם new כללה את התעתיק המפוצל לאכדית, יחד עם המזהה של הטקסט, על מנת להצמיד ביניהם בהמשך. [New](https://dh.saret.tk/dh/api/gnew)
- תרגום: טבלא נוספת הייתה טבלא בשם raw_texts שמטרתה הייתה להחזיק את כלל התרגומים של הטקסטים. [Jsons](https://dh.saret.tk/dh/api/gjson)
- ניתן לראות את כלל הקישורים ב[קישורים](https://dh.saret.tk/dh/api/links)
1. כתיבת קוד פייתון אשר יוריד את כלל המידע, ויכניס אותו לנתונון.
### עיבוד הנתונים
השלב הבא, לאחר איסוף הנתונים, הוא שלב העיבוד. שלב זה היה יחסית מאתגר. לאחר חודשים שבהם ניסיתי להריץ מספר מודלים פשוטים כגון Word2Vec, TF-IDF, Doc2Vec ועוד, התקבלו תוצאות מוזרות, של קשרים שהתאימו רק בין טקסט לבין עצמו, התאמה של 1, והשאר, היו על התאמה של 0.
לאחר מספר חודשים של ניסיונות, ונטישות, פניתי לעזרת פורום פייסבוק בקבוצת MDLI, שם הציעו לי מחדש ללכת על מודלים פשוטים, ואף שלחו לי מספר קישורים מתוך medium ([TF-IDF Vectorizer scikit-learn](https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a) ו־[Understanding TF-IDF and Cosine Similarity for Recommendation Engine](https://medium.com/geekculture/understanding-tf-idf-and-cosine-similarity-for-recommendation-engine-64d8b51aa9f9) ), והייתה לי התקדמות במודל. ואולם, על אף שהצליחו לצאת לי תוצאות, לא הצלחתי לייצר גרף מהווקטורים הללו.
### הדגמת תוצאות
שני טקסטים שנמצאו בעלי קרבה של כ־87% הם למשל, [P394767](http://oracc.iaas.upenn.edu/btto/P394767/html) ו־[P395011](http://oracc.iaas.upenn.edu/btto/P395011/html). לאחר בדיקה קצרה של הטקסטים הללו, גם לעיניים שלי, הם נראו דומים. ובאמת, שני הטקסטים הללו מגיעים מאותה רשימה קאנונית המכונה "House most high". באוראקק אין כול אזכור ש־P394767 הוא מתוך הרשימה ההיא, אך המודל מצא את הדמיון, והעלה זאת לבדו.
# סיכום
בסופו של דבר, המודל הצליח להציג תוצאות טובות, אך עדיין לא מספקות. על כן, יש צורך בעבודה נוספת על המודל, ובפרט על הנתונים שהוכנסו למודל. כמו כן, יש צורך בעבודה על הגרף עצמו, ובפרט על הצגתו למשתמש באופן נוח וידידותי. המודל, והשיטה יכולים להוות התקדמות למחקר עתידי, לפיתוחו ולשימוש להבנת האכדית בצורה טובה יותר.

Binary file not shown.

1
adsd

File diff suppressed because one or more lines are too long

1
aemw

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1
amgg
View File

@@ -1 +0,0 @@
[]

1
asbp

File diff suppressed because one or more lines are too long

1
atae

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1
blms

File diff suppressed because one or more lines are too long

1
btmao
View File

@@ -1 +0,0 @@
[]

1
btto
View File

@@ -1 +0,0 @@
[]

1
cams

File diff suppressed because one or more lines are too long

1
ccpo

File diff suppressed because one or more lines are too long

1
cdli

File diff suppressed because one or more lines are too long

1
ckst

File diff suppressed because one or more lines are too long

1
cmawro

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
[]

File diff suppressed because one or more lines are too long

147
datat.ipynb Normal file
View File

@@ -0,0 +1,147 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import sklearn\n",
"import sklearn.model_selection\n",
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
"import pandas as pd\n",
"import scipy\n",
"import numpy as np\n",
"\n",
"df_eng = pd.read_csv('raw_texts.csv')\n",
"df_akk = pd.read_csv('new.csv')\n",
"# akk_raw_train, akk_raw_test = sklearn.model_selection.train_test_split(df_akk, test_size=0.2, random_state=0)\n",
"# eng_raw_train, eng_raw_test = sklearn.model_selection.train_test_split(df_eng, test_size=0.2, random_state=0)\n",
"tf_vectorizer = TfidfVectorizer(analyzer='word')\n",
"# tf_vectorizer.fit(akk_raw_train['Text'].to_list())"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tf_vectorizer = TfidfVectorizer(analyzer='word')\n",
"save_vect = tf_vectorizer.fit_transform(df_akk['Text'].dropna().to_list())\n",
"# save_vect = tf_vectorizer.fit_transform(['The sun in the sky is bright', 'We can see the shining sun, the bright sun.'])\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"tfidf_tokens = tf_vectorizer.get_feature_names_out()\n",
"df_tfidfvect = pd.DataFrame(data=save_vect.toarray(), columns=tfidf_tokens)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"test_mat = tf_vectorizer.transform(df_akk['Text'].dropna().to_list())\n",
"cc = cosine_similarity(save_vect,save_vect)\n",
"bool_similarity = cc > 0.5\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"abcd = np.where((cc > 0.5)&( cc< 1))\n",
"abcd[0].tofile(\"data.csv\", sep = \",\", format = \"%d\")"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Using matplotlib backend: <object object at 0x00000212CB626CA0>\n"
]
}
],
"source": [
"%matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"f = sns.scatterplot(bool_similarity)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Project P394767\n",
"Text x x x BAD₃-ku-ri-gal-zi x E₂ 44 ša₂ BAD₃-{d}su...\n",
"Genre lexical\n",
"Name: 4, dtype: object"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_akk.iloc[4,:]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

1
dcclt

File diff suppressed because one or more lines are too long

1
dccmt

File diff suppressed because one or more lines are too long

1
doc
View File

@@ -1 +0,0 @@
[]

1
dsst

File diff suppressed because one or more lines are too long

1
ecut

File diff suppressed because one or more lines are too long

1
eisl

File diff suppressed because one or more lines are too long

1
epsd2
View File

@@ -1 +0,0 @@
[]

1
etcsri

File diff suppressed because one or more lines are too long

1
glass

File diff suppressed because one or more lines are too long

1
hbtin

File diff suppressed because one or more lines are too long

View File

@@ -1 +1 @@
C:/Users/Saret/Neutral Folder/dh/
C:/Users/Saret/WaitForIt/oracc/

1
lacost

File diff suppressed because one or more lines are too long

View File

@@ -1 +0,0 @@
[]

4114
missing_list.txt Normal file

File diff suppressed because it is too large Load Diff

1
neo
View File

@@ -1 +0,0 @@
[]

1
nere

File diff suppressed because one or more lines are too long

1
nimrud
View File

@@ -1 +0,0 @@
[]

1
obel
View File

@@ -1 +0,0 @@
[{"id_text": "P345452", "project_name": "obel", "raw_text": "Pure barge of the heavens, you are authoritative all on your own. Father Nanna, lord of Ur. Father Nanna, lord of the Eki\u0161nu\u014bal. Father Nanna, lord Dilimbabbar. Lord Nanna, foremost son of Enlil. When you float, when you float, When you appear authoritatively before your father, before Enlil, Father Nanna, when you appear authoritatively, when you raise your chest, When you appear authoritatively in your barge which is floating through the midst of heavens, Father Nanna, you, when you ride to your pure shrine, Father Nanna, when you float like a boat in a flood wave, When you float, when you float, you, when you float, When you float, when you pour out beer, you, when you float, When you pour out beer in a joyful mood, you, when you float, Father Nanna, when you tend to the ur cows and \u0161ar cows, Your father (Enlil) looks upon you with joyful eyes, and tends to you truly. Behold, he shines forth for the king; Enlil entrusted the sceptre of a lengthy reign to your hands. When you take care of lord Nudimmud, ... Having filled water into the .. canal ... Having filled water into the .. canal ... Having filled water into the Tigris, it is Nanna's. Having filled water into the Euphrates, it is Nanna's. Having filled water into canal and ditch for purification, they are Nanna's. Having filled the great marsh and the small marsh with water, they are Nanna's. An er\u0161ema song of Suen."}, {"id_text": "P355693", "project_name": "obel", "raw_text": "Oh my brother! ... Oh my brother! ... Oh my brother, son of Ga\u0161anmah! I lament for my brother, I lament, I lament in every way. I lament, the song of youthfulness I lament, in crying for the ... man She makes the woes plentyful, she makes the woes plentiful, standing up she makes the woes plentiful, young man, your mother makes the woes plentiful, your mother, Ninhursa\u014b, makes the woes plentiful. 
Our Princess in the Emah, the princess makes the woes plentiful. Atutur, the minister with hair hanging down makes the woes plentiful. My brother, you mother makes the woes plentiful. The palace of Ke\u0161 makes the woes plentiful. The brickwork of Iri\u0161ar makes the woes plentiful. The Emah of Adab makes the woes plentiful. The brickwork of Adab makes the woes plentiful. 'Where shall my son be handed over?' she is saying. 'Where shall my son, the Foolish One, be handed over?' she is saying. 'Where shall my son, the one I love, be handed over?' she is saying. The spouse calls out to her man. My brother, rise from your bed, may your mother rejoice over you. Your mother, Ga\u0161anhursa\u014b, may your mother rejoice over you. The en-priest, the lord, the great ruler of Adab may he rejoice over you. A\u0161irgi, the lord of Ke\u0161, may he rejoice over you. Atutur, in mourning, may she rejoice over you. Damgalnuna, of the Ema\u1e2b, the princess, may she rejoice over you. Lisin, the one of liver and heart, may she rejoice over you. ... ... ... ... ... Let me hear your sweet lips, let me hear your sweet voice let (my) heart be close to your good looks. Young man, do not let your mother, sit in tears, do not let your mother, Ninhursa\u014b, sit moaning, do not let Our Princess, sit (witnessing) your pain, do not let them do \"ua!\" Rise from your bed! Foolish One, do not let them do \"ua!\" Rise from your bed! The brother replied to his sister: My release, my sister, my release, Our Princess, my release, my sister, my release, Oh sister, do not speak so much, I am not one who can see. Our Princess, do not speak so much, I am not one who can see, My mother, Ga\u0161anmah, do not speak so much, I am not one who can see. In my bed, the dust of the netherworld, the ... lie with me. In my sleep, terror, the enemy sits with me. My sister, when I lie down and when I do not rise, my mother is the one who is anguished(?) over me, may I loosen the silah. 
Ga\u0161anhursa\u014b is the one who is anguished over me, may I loosen the simlah. My sister, stand up, give me my share, the estate of my father. My father made the woes over me plentiful, that be my share. Let my mother let her hair hang down for me, so that my ribs may lay down. May the bride whom my father (chose for me) measure grain for me, so that I may listen to it. Acquire a bed for me, (and recite) \"Its spirit is blown off.\" Set up the throne, seat the silah. Place the clothes on the throne, cover the simlah. Make funerary offerings, turn, accept them for me. Pour water into the libation pipe, and stir in the dust of the netherworld. Pour out the hot soup, let me drink its radiance. My sister, alas! Where ...? Our Princess ... Tears ... ... ... ... ... ..."}]

1
obmc

File diff suppressed because one or more lines are too long

1
obta
View File

@@ -1 +0,0 @@
[]

1
ogsl
View File

@@ -1 +0,0 @@
[]

1
oimea
View File

@@ -1 +0,0 @@
[]

1
pnao
View File

@@ -1 +0,0 @@
[]

View File

@@ -51,4 +51,4 @@ saao
suhu
tcma
tsae
xcat
xcat

File diff suppressed because it is too large Load Diff

1
qcat
View File

@@ -1 +0,0 @@
[]

15
report.html Normal file
View File

@@ -0,0 +1,15 @@
<html>
<head>
<title>דו"ח התקדמות</title>
<style>
</style>
</head>
<body dir="rtl">
<h1>דו"ח התקדמות</h1>
<h2>מטרות</h2>
<p>מטרת הפרוייקט היא לייצר דרך למצוא קרבה או אינטרטקסטואליות בין טקסטים שונים באכדית על מנת </p>
</body>
</html>

116
requirements.txt Normal file
View File

@@ -0,0 +1,116 @@
aiofiles==22.1.0
aiosqlite==0.18.0
anyio==3.6.2
argon2-cffi==21.3.0
argon2-cffi-bindings==21.2.0
arrow==1.2.3
asttokens==2.2.1
attrs==22.2.0
autopep8==2.0.2
Babel==2.12.1
backcall==0.2.0
bcrypt==4.0.1
beautifulsoup4==4.12.2
bleach==6.0.0
bs4==0.0.1
certifi==2022.12.7
cffi==1.15.1
charset-normalizer==3.1.0
colorama==0.4.6
comm==0.1.3
cryptography==40.0.1
debugpy==1.6.7
decorator==5.1.1
defusedxml==0.7.1
entrypoints==0.4
executing==1.2.0
fastjsonschema==2.16.3
fqdn==1.5.1
idna==3.4
importlib-metadata==6.3.0
ipykernel==6.22.0
ipython==8.12.0
ipython-genutils==0.2.0
isoduration==20.11.0
jedi==0.18.2
Jinja2==3.1.2
json5==0.9.11
jsonpointer==2.3
jsonschema==4.17.3
jupyter-contrib-core==0.4.2
jupyter-contrib-nbextensions==0.7.0
jupyter-events==0.6.3
jupyter-highlight-selected-word==0.2.0
jupyter-kite==2.0.2
jupyter-latex-envs==1.4.6
jupyter-nbextensions-configurator==0.6.1
jupyter-ydoc==0.2.3
jupyter_client==8.1.0
jupyter_core==5.3.0
jupyter_server==2.5.0
jupyter_server_fileid==0.9.0
jupyter_server_terminals==0.4.4
jupyter_server_ydoc==0.8.0
jupyterlab==3.6.3
jupyterlab-execute-time==2.3.1
jupyterlab-pygments==0.2.2
jupyterlab_server==2.22.0
lxml==4.9.2
MarkupSafe==2.1.2
matplotlib-inline==0.1.6
mistune==2.0.5
nbclassic==0.5.5
nbclient==0.7.3
nbconvert==7.3.1
nbformat==5.8.0
nest-asyncio==1.5.6
notebook==6.5.4
notebook_shim==0.2.2
numpy==1.24.2
packaging==23.0
pandas==2.0.0
pandocfilters==1.5.0
paramiko==3.1.0
parso==0.8.3
pickleshare==0.7.5
platformdirs==3.2.0
prometheus-client==0.16.0
prompt-toolkit==3.0.38
psutil==5.9.4
psycopg2==2.9.6
pure-eval==0.2.2
pycodestyle==2.10.0
pycparser==2.21
Pygments==2.15.0
PyNaCl==1.5.0
pyrsistent==0.19.3
python-dateutil==2.8.2
python-json-logger==2.0.7
pytz==2023.3
PyYAML==6.0
pyzmq==25.0.2
requests==2.28.2
rfc3339-validator==0.1.4
rfc3986-validator==0.1.1
Send2Trash==1.8.0
six==1.16.0
sniffio==1.3.0
soupsieve==2.4
sshtunnel==0.4.0
stack-data==0.6.2
terminado==0.17.1
tinycss2==1.2.1
tomli==2.0.1
tornado==6.2
traitlets==5.9.0
typing_extensions==4.5.0
tzdata==2023.3
uri-template==1.2.0
urllib3==1.26.15
wcwidth==0.2.6
webcolors==1.13
webencodings==0.5.1
websocket-client==1.5.1
y-py==0.5.9
ypy-websocket==0.8.2
zipp==3.15.0

1
riao
View File

@@ -1 +0,0 @@
[]

1
ribo

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

1
rinap

File diff suppressed because one or more lines are too long

1
saao

File diff suppressed because one or more lines are too long

7497
scrape.log

File diff suppressed because it is too large Load Diff

View File

@@ -6,4 +6,4 @@ import psycopg2
# conn = psycopg2.connect("dbname='dh' user='dh' host='dh.saret.tk' password='qwerty'")
# return conn
def
# def

View File

@@ -1,7 +1,7 @@
import json
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, ResultSet
import os
from pathlib import Path
import re
@@ -32,9 +32,25 @@ def _load_json_from_path(json_path: str) -> Dict:
return json.load(json_file)
def _download_data_from_website(url: str, timeout: float = 30.0) -> ResultSet:
    """Fetch ``url`` and return every ``<span class="cell">`` element in it.

    :param url: Address of the page to download.
    :param timeout: Seconds to wait for the server before giving up. Without
        a timeout ``requests.get`` can wait indefinitely on a stalled
        connection, which would hang the whole scraping run.
    :return: The matching elements, or an empty list when the download or the
        parsing failed (the failure is logged, not raised).
    """
    try:
        res = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(res.text, "html.parser")
        return soup.find_all("span", {"class": "cell"})
    except Exception as e:
        # Best-effort scraping: a single bad page is logged and skipped so
        # that the caller can continue with the remaining texts.
        logging.error(e)
        return list()
def _clean_raw_text(results: ResultSet) -> str:
return " ".join(["".join([content if isinstance(content, str) else content.text
for content in result.contents]) for result in results]).replace('\n', ' ')
def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'oracc.museum.upenn.edu') -> List[Dict]:
raw_jsons = list()
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True) + glob.glob(
f'jsons_unzipped/{project_dirname}/corpusjson/*.json', recursive=True)
# path = Path(os.path.join(JSONS_DIR, project_dirname, 'catalogue.json'))
# if not os.path.isfile(path):
# return raw_jsons
@@ -43,8 +59,11 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'or
# for member in d.get('members').values():
for filename in all_paths:
cur_json = _load_json_from_path(filename)
project_name = cur_json['project']
try:
project_name = cur_json['project']
except TypeError:
logging.error(f"Error in {filename}")
continue
# # Skip in case we are in saa project and the current sub project is not in neo-assyrian
# if project_dirname == "saao" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS: # TODO: validate
# continue
@@ -55,12 +74,7 @@ def get_raw_english_texts_of_project(project_dirname: str, oracc_site: str = 'or
# print(url)
logging.info(url)
try:
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")
results = soup.find_all("span", {"class": "cell"})
raw_text = " ".join(["".join([content if isinstance(content, str) else content.text
for content in result.contents]) for result in results])
raw_text = raw_text.replace('\n', ' ')
raw_text = _clean_raw_text(_download_data_from_website(url))
if raw_text:
raw_jsons.append({
"id_text": cur_json['textid'],
@@ -128,13 +142,13 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
:return: A list of jsons containing the raw texts of the given project and basic metadata.
"""
raw_jsons = list()
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)
all_paths = glob.glob(f'jsons_unzipped/{project_dirname}/**/corpusjson/*.json', recursive=True)+glob.glob(
f'jsons_unzipped/{project_dirname}/corpusjson/*.json', recursive=True)
for filename in all_paths:
cur_json = _load_json_from_path(filename)
try:
project_name = cur_json['project']
sents_dicts = cur_json['cdl'][0]['cdl'][-1]['cdl']
except Exception as e:
print(f"In file {filename} failed because of {e}")
@@ -143,8 +157,7 @@ def get_raw_akk_texts_of_project(project_dirname: str) -> List[Dict]:
raw_text = get_raw_akk_text_from_json(sents_dicts)
raw_jsons.append({
"id_text": cur_json['textid'],
"project_name": project_name,
"raw_text": raw_text,
"raw_text": raw_text
})
# if not texts_jsons or not texts_jsons.get('members'):
@@ -198,7 +211,7 @@ def _get_raw_text(json_dict: dict) -> str:
raw_texts.extend(_get_raw_text(d['cdl']).split())
elif _is_word(d): # If node represents a word
if previous_ref != d.get('ref'): # If encountered new instance:
cur_text = d['frag'] if d.get('frag') else d['f']['form']
cur_text = d['f']['norm'] if d['f'].get('norm') else d['f']['form']
raw_texts.append(cur_text + _get_addition(d))
previous_ref = d.get('ref')

1
suhu

File diff suppressed because one or more lines are too long

1
tcma
View File

@@ -1 +0,0 @@
[]

1
tsae
View File

@@ -1 +0,0 @@
[]

1
xcat
View File

@@ -1 +0,0 @@
[]