From 6380a9fa20fc594168b3f175a6ff2d74d4758ac4 Mon Sep 17 00:00:00 2001 From: 1kamma Date: Wed, 12 Apr 2023 01:43:03 +0300 Subject: [PATCH] t1 --- 2023-04-12_01-39-28.log | 5 +++++ __pycache__/scrapping.cpython-39.pyc | Bin 6802 -> 6969 bytes project_notebook.ipynb | 10 +++++----- scrape.log | 5 +++++ 4 files changed, 15 insertions(+), 5 deletions(-) create mode 100644 2023-04-12_01-39-28.log diff --git a/2023-04-12_01-39-28.log b/2023-04-12_01-39-28.log new file mode 100644 index 0000000..cb7db25 --- /dev/null +++ b/2023-04-12_01-39-28.log @@ -0,0 +1,5 @@ +root - INFO - http://oracc.iaas.upenn.edu/ario/Q006653/html +root - INFO - http://oracc.iaas.upenn.edu/ario/Q006654/html +root - INFO - http://oracc.iaas.upenn.edu/ario/Q006655/html +root - INFO - http://oracc.iaas.upenn.edu/ario/Q007129/html +root - INFO - http://oracc.iaas.upenn.edu/ario/Q007130/html diff --git a/__pycache__/scrapping.cpython-39.pyc b/__pycache__/scrapping.cpython-39.pyc index 55854c22f15ce08d7946c54b7804f5014d9b60ed..34c0e416c6a2232a9f81ad01a492503d7d8749ef 100644 GIT binary patch delta 2225 zcmZWqS#KLv6rO93?eV_Eabjn4nog2Q;4@mq19(aIxK!QFY_yq_d&be-p8!XK?XSwIhxo7hS zd%oE_ZA7CX34Z&2-9vvHlca~pbpHatEW%^Y-ObJ*;qd2lvL zw}E`b^q6@%2K;ua0zdj6KTaor-v)fI&`(kg_%TY(NZJ%XNoF^a>cxUuFQ~Mr9xtfN z1@+9jJZm+pE?HY2S0@`r-PG)SA+Jta=2g?`;_Ieu8&%M*CskTdO9l1Syn6hAx_m%A zqxoE)IPWTznq{I@hpIQJ>G~^8RyQ0sKnE9nE`Yl zys!bF+|ayLvz>CY-dZ@s-}RS9ccRqKOuKD4opZth53^kWhj=`2oec7K0uzZHcor_z zphd&7*c|^ppphZ|XCQY}5vp>V6)}3YA6ajkS*{mcyRzMEx7gfTT4HYboSWyP zfjk-Gma?Cm;-4vB?oGf!mPAM)pjX{%8^(HYW%ANx?d7Qzjm@A2J!(OgM!KKBA6!Ws zN3!ek{O@3P-~|xc=u{#yWKba?5+fP1o3IOfXQ&y5@sQYdgmHd9v};!Qo&a78uxC)b z2%v{;$8g$qu?+QO=TIyo?BJnrPJ0?j43wQfcpd>iD?)rCpylugInV3iEYbPf;WC-# ze}oTCV~#Zs6J_x_Mzz}nRBmv-YBtPkEw)hMrz2N(iYs33GLqMtjBc?Km8@bE2LC;> zUi9VWAQIjw$Qh#xGLs%8)l(_t~{-!)8U$>WX95TSX8O-fXp)aoM@WnR zk{X@G9?^s7N4V*>MOQezUn5}vtk3e#lR3WB)3+e%PMD3Vt^q-o0el6I?E#R8;9;icVcb znW$z|w3|U`&8fi-5pnw18joeB(pv&WM-TYP%t#a|n3oOo5{ERw^k>Fr7Jj9GQ%QnP6>%ooFpGmN?`BO*5gcu<*#zqYnzE)d{s+<=PMt`XR<5yBfe m|F{ARH%H`u{9f6kcw}!PlAct;N=)%9$@D=*PGk~tLjD(3$nm-W delta 2124 zcmZuxOKcNY6rDHzjeoIYJC2+N?)sPvZCFv=_aTZLaNkmTNW&;N~7*jrK+o{%P!j9`wUgEYU`bMKkweT?|$Bs zeV;7NTIqB`!QYopm-zlkMR`HY_Fov8Be=Yg2eTh+*s1Gj9zpEm(T`N7@R*UQD_ncX zj4Y2E{dUe)d16!LNuIhNHuA`aY_+WLG|KvT24w}F<^A^+V}R#)9`_Wi#KX<({db`7pF9RwnV}>Shq~Ni5J9CWwtGU)dPw?&}to!Dp(Yo z+Gu`>gv{RlE%Bo^#m2>-+5vW2%*7vPlXwPF1Zjdr@khLn+esu@>v6~sofpN#)tO^N z_T0PB>NL$;E^yCmv`o8I$26>-5}S!KyCA+!xXCdx+Cfke3&}n6()R@NdJHI2a0tOj zdcNg%JhSF<8_tuM5(qQm ztA5kDh{wGwf}-YF0{3R6mYtYncn9)hdFP4AvuCUU-OMAr!-GLPG>#CPdZzlFMPd$K+@ z#=sK4r=PM(@prK#zU`~by-O}w5V>hNo!5#@KDA}XYOZjrx{4frIGLH*c%R7kNE`Ax z?SS14*gcp=UQd7wY9Hys)P1m<&b~>Y6I7)}P6Oqr8-7$)BM4)nJR{;Y>Maip)Eb&^ zH`^wPgD^>7-mH%dW!GOqsxH3IOtDK_zhz#q{xYeC5H7(mk)z_5{=JoPBKII*%TQ?W z76DBu940s~7INb!DKzX?8WMP$;3xqN2FS*x1`3j%-SR!0=$h}rDbn@!koY`TVe8^a zZkD;?*W3Zw5e2asEsHbxLAEA%zM@lO7;*9?U2QKJ`J`!nOvF|3ReoPp)|f$yB(IMT zG*PyN%To~)CV6~%x=$5FiI2`iA*x2@*sO{9V(IM5O36&L(JF{yr`uSo8_2*pl4xeb zlT#_Dt6R0SNt0o5-tZf4t6RAUfcUgHbEGFKa`X)@kE*0-VHU+v(O3x=bzlyMMt%%O zq>eYQkc(hy}D)_4fgDXyC@2n1NU%;4+(ng)=~Bx*DJgAt!IM| z80!eLJiQ@PLZ*61e$>dUTAnG}uUlZO5O&@FR_HdG{X~xCa}X=yLy*PWb?*tJ)hm$4 zEr|NJ+l^Kos6Y@RpjXQXtt@+s-A9UC1TlJtoyH18=*SrY*`@nQnOw2FM(vo}y4Ju^ zIDPiSIdNrZX+utwY~#~({sO^e0-AWB;Q*I_nj{FzZlKBp;T6|)fN~0yAQ|qq%*h;5 h<#70$F7G&Yl_BRXAZYTMDZDB!s;L&rGBG+l@-NVe$ff`Q diff --git a/project_notebook.ipynb b/project_notebook.ipynb index c34a774..d7808ae 100644 --- a/project_notebook.ipynb +++ b/project_notebook.ipynb @@ -10,7 +10,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -39,8 +39,8 @@ "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[1;32mIn[17], line 5\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mproject_list\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[0;32m 4\u001b[0m \u001b[39mfor\u001b[39;00m project \u001b[39min\u001b[39;00m f\u001b[39m.\u001b[39mread()\u001b[39m.\u001b[39msplit(\u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m'\u001b[39m):\n\u001b[1;32m----> 5\u001b[0m scrap \u001b[39m=\u001b[39m scrapping\u001b[39m.\u001b[39;49mget_raw_english_texts_of_project(project)\n\u001b[0;32m 6\u001b[0m \u001b[39m# connection = connection.execute(\"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\", scrap)\u001b[39;00m\n\u001b[0;32m 7\u001b[0m \u001b[39mtry\u001b[39;00m:\n", - "File \u001b[1;32mc:\\Users\\Saret\\Programming\\C#\\DH\\scrapping.py:43\u001b[0m, in \u001b[0;36mget_raw_english_texts_of_project\u001b[1;34m(project_dirname)\u001b[0m\n\u001b[0;32m 41\u001b[0m \u001b[39mfor\u001b[39;00m filename \u001b[39min\u001b[39;00m all_paths:\n\u001b[0;32m 42\u001b[0m cur_json \u001b[39m=\u001b[39m _load_json_from_path(filename)\n\u001b[1;32m---> 43\u001b[0m project_name \u001b[39m=\u001b[39m cur_json[\u001b[39m'\u001b[39;49m\u001b[39mproject\u001b[39;49m\u001b[39m'\u001b[39;49m]\n\u001b[0;32m 45\u001b[0m \u001b[39m# # Skip in case we are in saa project and the current sub project is not in neo-assyrian\u001b[39;00m\n\u001b[0;32m 46\u001b[0m \u001b[39m# if project_dirname == \"saao\" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS: # TODO: validate\u001b[39;00m\n\u001b[0;32m 47\u001b[0m \u001b[39m# continue\u001b[39;00m\n\u001b[0;32m 48\u001b[0m \n\u001b[0;32m 49\u001b[0m \u001b[39m# id_text = member.get('id_text', \"\") + member.get('id_composite', \"\")\u001b[39;00m\n\u001b[0;32m 50\u001b[0m \u001b[39m# html_dir = \"/\".join(path.parts[1:-1])\u001b[39;00m\n\u001b[0;32m 51\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mhttp://oracc.iaas.upenn.edu/\u001b[39m\u001b[39m{\u001b[39;00mproject_name\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00mcur_json[\u001b[39m'\u001b[39m\u001b[39mtextid\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m}\u001b[39;00m\u001b[39m/html\u001b[39m\u001b[39m\"\u001b[39m\n", + "Cell \u001b[1;32mIn[2], line 6\u001b[0m\n\u001b[0;32m 3\u001b[0m \u001b[39mwith\u001b[39;00m \u001b[39mopen\u001b[39m(\u001b[39m'\u001b[39m\u001b[39mproject_list\u001b[39m\u001b[39m'\u001b[39m) \u001b[39mas\u001b[39;00m f:\n\u001b[0;32m 4\u001b[0m \u001b[39mfor\u001b[39;00m project \u001b[39min\u001b[39;00m f\u001b[39m.\u001b[39mread()\u001b[39m.\u001b[39msplit(\u001b[39m'\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m'\u001b[39m):\n\u001b[0;32m 5\u001b[0m \u001b[39m# connection = connection.execute(\"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\", scrap)\u001b[39;00m\n\u001b[1;32m----> 6\u001b[0m scrap \u001b[39m=\u001b[39m scrapping\u001b[39m.\u001b[39;49mget_raw_english_texts_of_project(project)\n\u001b[0;32m 7\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 8\u001b[0m execute_batch(connection, \u001b[39m\"\u001b[39m\u001b[39minsert into raw_texts values (\u001b[39m\u001b[39m%(id_text)s\u001b[39;00m\u001b[39m, \u001b[39m\u001b[39m%(project_name)s\u001b[39;00m\u001b[39m, \u001b[39m\u001b[39m%(raw_text)s\u001b[39;00m\u001b[39m)\u001b[39m\u001b[39m\"\u001b[39m, scrap)\n", + "File \u001b[1;32mc:\\Users\\Saret\\Programming\\C#\\DH\\scrapping.py:46\u001b[0m, in \u001b[0;36mget_raw_english_texts_of_project\u001b[1;34m(project_dirname)\u001b[0m\n\u001b[0;32m 44\u001b[0m \u001b[39mfor\u001b[39;00m filename \u001b[39min\u001b[39;00m all_paths:\n\u001b[0;32m 45\u001b[0m cur_json \u001b[39m=\u001b[39m _load_json_from_path(filename)\n\u001b[1;32m---> 46\u001b[0m project_name \u001b[39m=\u001b[39m cur_json[\u001b[39m'\u001b[39;49m\u001b[39mproject\u001b[39;49m\u001b[39m'\u001b[39;49m]\n\u001b[0;32m 48\u001b[0m \u001b[39m# # Skip in case we are in saa project and the current sub project is not in neo-assyrian\u001b[39;00m\n\u001b[0;32m 49\u001b[0m \u001b[39m# if project_dirname == \"saao\" and project_name[-2:] not in SUB_PROJECTS_IN_NEO_ASS: # TODO: validate\u001b[39;00m\n\u001b[0;32m 50\u001b[0m \u001b[39m# continue\u001b[39;00m\n\u001b[0;32m 51\u001b[0m \n\u001b[0;32m 52\u001b[0m \u001b[39m# id_text = member.get('id_text', \"\") + member.get('id_composite', \"\")\u001b[39;00m\n\u001b[0;32m 53\u001b[0m \u001b[39m# html_dir = \"/\".join(path.parts[1:-1])\u001b[39;00m\n\u001b[0;32m 54\u001b[0m url \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mhttp://oracc.iaas.upenn.edu/\u001b[39m\u001b[39m{\u001b[39;00mproject_name\u001b[39m}\u001b[39;00m\u001b[39m/\u001b[39m\u001b[39m{\u001b[39;00mcur_json[\u001b[39m'\u001b[39m\u001b[39mtextid\u001b[39m\u001b[39m'\u001b[39m]\u001b[39m}\u001b[39;00m\u001b[39m/html\u001b[39m\u001b[39m\"\u001b[39m\n", "\u001b[1;31mTypeError\u001b[0m: 'NoneType' object is not subscriptable" ] } @@ -50,8 +50,8 @@ "connection = conn.cursor()\n", "with open('project_list') as f:\n", " for project in f.read().split('\\n'):\n", - " scrap = scrapping.get_raw_english_texts_of_project(project)\n", " # connection = connection.execute(\"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\", scrap)\n", + " scrap = scrapping.get_raw_english_texts_of_project(project)\n", " try:\n", " execute_batch(connection, \"insert into raw_texts values (%(id_text)s, %(project_name)s, %(raw_text)s)\", scrap)\n", " conn.commit()\n", diff --git a/scrape.log b/scrape.log index 46ab031..25ae287 100644 --- a/scrape.log +++ b/scrape.log @@ -7490,3 +7490,8 @@ INFO:root:Start CRITICAL:root:fuck INFO:root:Start INFO:root:Start +INFO:root:http://oracc.iaas.upenn.edu/ario/Q006653/html +INFO:root:http://oracc.iaas.upenn.edu/ario/Q006654/html +INFO:root:http://oracc.iaas.upenn.edu/ario/Q006655/html +INFO:root:http://oracc.iaas.upenn.edu/ario/Q007129/html +INFO:root:http://oracc.iaas.upenn.edu/ario/Q007130/html