Fixed the bug in cerberus_web_client.py by switching to Selenium. Each chain that uses it must define a username for the Selenium login. With this mechanism, a path to a local .gz file is returned instead of a URL.

Added the option to output a prices json file in main.py under --prices-with-promos, where the prices are updated by the latest promotions (under the 'final_price' key, where 'price' represents the price before promotions).

Fixed a small bug in BinaProjectWebClient by checking that the filename does not contain 'null'.

Changed Hierarchy of chains such that it includes the webclients.

Added the date to the output filenames to start storing the data over time.

Black formatting (according to PEP 8 guidelines).

Changed the chains_dict in main to a constant one.
This commit is contained in:
korenlazar
2022-10-04 11:42:36 +03:00
parent b5db721a3d
commit ceff48dbd9
28 changed files with 796 additions and 406 deletions

View File

@@ -1,6 +1,5 @@
# --- chains/bareket.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
# SupermarketChain is already a base of MahsaneiHashook, so the redundant
# explicit base class was dropped.
class Bareket(MahsaneiHashook, SupermarketChain):
class Bareket(MahsaneiHashook):
pass

View File

@@ -8,14 +8,16 @@ from supermarket_chain import SupermarketChain
FNAME_KEY = "FileNm"
# --- chains/binaproject_web_client.py (diff rendering: an old line is shown
# immediately before its replacement) ---
# New: the web client is now itself a SupermarketChain subclass.
class BinaProjectWebClient:
class BinaProjectWebClient(SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
_update_date_format = '%Y-%m-%d %H:%M:%S'
_path_prefix = ""
_hostname_suffix = ".binaprojects.com"
# Renamed: per the commit message, the method may now return a local file
# path as well as a URL.
def get_download_url(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
def get_download_url_or_path(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
-> str:
if not SupermarketChain.is_valid_store_id(store_id):
raise ValueError(f"Invalid {store_id=} (store id must be a natural number)")
hostname = f"http://{self.hostname_prefix}{self.hostname_suffix}"
url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"])
req_res: requests.Response = session.get(url)
@@ -27,7 +29,7 @@ class BinaProjectWebClient:
if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files):
return "" # Could not find non-full Promos/Prices file
else:
# Bug fix in this commit: also reject filenames containing 'null'.
filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname
filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname and 'null' not in fname
suffix = next(
cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY]))
down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix])

View File

@@ -1,35 +1,99 @@
import json
import re
import logging
import os
import shutil
import time
from abc import abstractmethod
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from supermarket_chain import SupermarketChain
# --- chains/cerberus_web_client.py (diff: OLD implementation, removed by this
# commit) ---
# Pre-Selenium version: logged in with a plain requests POST (username only,
# no password field is sent) and returned a download URL scraped from the
# site's ajax_dir listing.
class CerberusWebClient:
def get_download_url(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
-> str:
hostname: str = "https://publishedprices.co.il"
# Post the payload to the site to log in
session.post(hostname + "/login/user", data={'username': self.username})
# Scrape the data
ajax_dir_payload: dict = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
s: requests.Response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
s_json: dict = json.loads(s.text)
# For plain Promos/Prices, skip any file whose name contains "full"
# (case-insensitive); other categories match on store id alone.
if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
filter_func = lambda d, id: f'-{id:03d}-20' in d['name'] and not re.search('full', d['name'], re.IGNORECASE)
if not any(filter_func(d, store_id) for d in s_json['aaData']):
return "" # Could not find non-full Prices/Promos file
else:
filter_func = lambda d, id: f'-{id:03d}-20' in d['name']
suffix: str = next(d['name'] for d in s_json['aaData'] if filter_func(d, store_id))
download_url: str = hostname + "/file/d/" + suffix
return download_url
class CerberusWebClient(SupermarketChain):
    """Web client for chains hosted on the Cerberus platform (publishedprices.co.il).

    Logs in through a Selenium-driven Chrome session using the chain-specific
    ``username`` property, locates the newest file matching the requested
    category and store, downloads it via the browser, and returns a local
    file path (not a URL) to the downloaded archive.
    """

    @property
    @abstractmethod
    def username(self) -> str:
        """Login username for this chain on the Cerberus site."""

    @staticmethod
    def _link_filter(store_id: int, category: SupermarketChain.XMLFilesCategory):
        """Return a predicate selecting hrefs matching *store_id* and *category*.

        Raises ValueError for an unknown category (fails fast, before any
        links are scanned).
        """
        store_token = f"-{store_id:03d}-20"
        if category == SupermarketChain.XMLFilesCategory.Promos:
            return lambda link: "promo" in link and "full" not in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.PromosFull:
            return lambda link: "promo" in link and "full" in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.Prices:
            return lambda link: "price" in link and "full" not in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.PricesFull:
            return lambda link: "price" in link and "full" in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.Stores:
            # Stores files are published once per chain under store id 000.
            return lambda link: "store" in link and "full" in link and "-000-20" in link
        raise ValueError(f"Unknown category type: {category=}")

    def get_download_url_or_path(
        self,
        store_id: int,
        category: SupermarketChain.XMLFilesCategory,
        session: requests.Session,
    ) -> str:
        """Download the newest matching file and return its local path.

        *session* is unused in the Selenium flow but kept for interface
        compatibility with the other web clients. Returns "" when no
        matching file is found.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("ignore-certificate-errors")
        options.add_argument("--ignore-ssl-errors=yes")
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
        try:
            driver.get("https://url.retail.publishedprices.co.il/login#")
            time.sleep(2)
            driver.find_element(By.NAME, "username").send_keys(self.username)
            driver.find_element(By.NAME, "Submit").click()
            time.sleep(2)

            # Narrow the file table with the site's search box.
            search_elem = driver.find_element(By.CLASS_NAME, "form-control")
            search_elem.send_keys(category.value)
            time.sleep(5)  # crude wait for the table to refresh — TODO: use WebDriverWait

            # The predicate only depends on store/category, so build it once
            # outside the loop (was rebuilt per link).
            filter_func = self._link_filter(store_id, category)
            best_link = ""
            for conn in driver.find_elements(By.CLASS_NAME, "f"):
                link = conn.get_attribute("href").lower()
                if filter_func(link):
                    # File names end with digits before the extension; keep
                    # the link with the largest (i.e. most recent) value.
                    if not best_link or int(link[-7:-3]) > int(best_link[-7:-3]):
                        best_link = link
            if not best_link:
                return ""

            # Clicking the link makes Chrome download the file.
            driver.get(best_link)
            time.sleep(3)  # crude wait for the download to complete
        finally:
            driver.quit()  # was leaked: every call left a Chrome process running

        # Chrome saves into the user's Downloads directory by default; allow an
        # env override instead of the previous hard-coded personal path.
        download_dir = os.environ.get(
            "CERBERUS_DOWNLOAD_DIR", os.path.expanduser(os.path.join("~", "Downloads"))
        )
        # Derive the filename from the URL (was a brittle magic slice [48:]).
        filename = os.path.basename(best_link)
        path_download = os.path.join(download_dir, filename)
        logging.info(f"{path_download=}")

        os.makedirs("raw_files", exist_ok=True)
        # Include the downloaded filename so different categories/stores do not
        # overwrite each other (the old fixed name collided).
        path_to_save = os.path.join("raw_files", f"{self.username}-{filename}")
        try:
            shutil.move(path_download, path_to_save)
            print(f"Downloaded {filename} and moved file to {path_to_save}")
        except (FileNotFoundError, shutil.Error):
            # Best-effort: the file may already have been moved by an earlier
            # run; a bare except here previously hid real errors too.
            print(f"{filename} already exists in {path_to_save}")
        return path_to_save

View File

@@ -1,6 +1,5 @@
# --- chains/co_op.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
# SupermarketChain is already a base of MahsaneiHashook, so the redundant
# explicit base class was dropped.
class CoOp(MahsaneiHashook, SupermarketChain):
class CoOp(MahsaneiHashook):
pass

View File

@@ -1,6 +1,9 @@
# --- chains/dor_alon.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class DorAlon(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): CerberusWebClient now derives from SupermarketChain
# itself, and the chain supplies the username used for the Selenium login.
class DorAlon(CerberusWebClient):
@property
def username(self):
return "doralon"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
# --- chains/freshmarket.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class Freshmarket(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class Freshmarket(CerberusWebClient):
_date_hour_format = "%Y-%m-%d %H:%M:%S"
@property
def username(self):
return "freshmarket"

View File

@@ -1,6 +1,9 @@
# --- chains/hazi_hinam.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class HaziHinam(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class HaziHinam(CerberusWebClient):
@property
def username(self):
return "HaziHinam"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
# --- chains/keshet.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class Keshet(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class Keshet(CerberusWebClient):
@property
def username(self):
return "Keshet"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,7 +1,6 @@
# --- chains/king_store.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped. The overrides below point the client at
# KingStore's own host instead of the *.binaprojects.com default.
class KingStore(BinaProjectWebClient, SupermarketChain):
class KingStore(BinaProjectWebClient):
_path_prefix = "Food_Law"
_hostname_suffix = ".co.il"

View File

@@ -1,6 +1,5 @@
# --- chains/maayan2000.py (diff rendering: the old lines are shown immediately
# before their replacements) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped.
class Maayan2000(BinaProjectWebClient, SupermarketChain):
pass
class Maayan2000(BinaProjectWebClient):
pass

View File

@@ -1,5 +1,6 @@
import re
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
@@ -9,33 +10,46 @@ from supermarket_chain import SupermarketChain
# --- chains/mahsaneiHashook.py (diff rendering: runs of old lines are shown
# immediately before their Black-reformatted replacements) ---
class MahsaneiHashook(SupermarketChain):
# Old single-quoted class attributes (removed) ...
_promotion_tag_name = 'Sale'
_promotion_update_tag_name = 'PriceUpdateDate'
_date_format = '%Y/%m/%d'
_date_hour_format = '%Y/%m/%d %H:%M:%S'
_update_date_format = '%Y/%m/%d %H:%M:%S'
_item_tag_name = 'Product'
# ... and their double-quoted replacements (added); values are unchanged.
_promotion_tag_name = "Sale"
_promotion_update_tag_name = "PriceUpdateDate"
_date_format = "%Y/%m/%d"
_date_hour_format = "%Y/%m/%d %H:%M:%S"
_update_date_format = "%Y/%m/%d %H:%M:%S"
_item_tag_name = "Product"
@staticmethod
# Renamed to get_download_url_or_path for consistency with the other clients
# (this implementation still returns a URL scraped from matrixcatalog.co.il).
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
def get_download_url_or_path(
store_id: int,
category: SupermarketChain.XMLFilesCategory,
session: requests.Session,
) -> str:
prefix = "http://matrixcatalog.co.il/"
url = prefix + "NBCompetitionRegulations.aspx"
req_res: requests.Response = requests.get(url)
soup = BeautifulSoup(req_res.text, features='lxml')
# For plain Promos/Prices categories, pick a file for this store whose name
# does NOT contain "full"; otherwise match on category and store id only.
if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
and f'-{store_id:03d}-20' in fname \
and not re.search('full', fname, re.IGNORECASE)
if soup.find('a', href=fname_filter_func) is None:
soup = BeautifulSoup(req_res.text, features="lxml")
if category in [
SupermarketChain.XMLFilesCategory.Promos,
SupermarketChain.XMLFilesCategory.Prices,
]:
fname_filter_func = (
lambda fname: fname
and category.name.replace("s", "") in fname
and f"-{store_id:03d}-20" in fname
and not re.search("full", fname, re.IGNORECASE)
)
if soup.find("a", href=fname_filter_func) is None:
return "" # Could not find non-full Promos/Prices file
else:
fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
and f'-{store_id:03d}-20' in fname
suffix: str = soup.find('a', href=fname_filter_func).attrs['href']
fname_filter_func = (
lambda fname: fname
and category.name.replace("s", "") in fname
and f"-{store_id:03d}-20" in fname
)
suffix: str = soup.find("a", href=fname_filter_func).attrs["href"]
down_url: str = prefix + suffix
return down_url
@staticmethod
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
# Look up the promoted item by its ItemCode; an unknown code yields [].
promo_item = items_dict.get(promo.find('ItemCode').text)
promo_item = items_dict.get(promo.find("ItemCode").text)
return [promo_item] if promo_item else []

View File

@@ -1,6 +1,9 @@
# --- chains/osher_ad.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class OsherAd(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class OsherAd(CerberusWebClient):
@property
def username(self):
return "osherad"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
# --- chains/rami_levi.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class RamiLevi(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class RamiLevi(CerberusWebClient):
@property
def username(self):
return "RamiLevi"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,5 @@
# --- chains/shefa_birkat_hashem.py (diff rendering: old lines are shown
# immediately before their replacements) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped.
class ShefaBirkatHashem(BinaProjectWebClient, SupermarketChain):
pass
class ShefaBirkatHashem(BinaProjectWebClient):
pass

View File

@@ -1,7 +1,7 @@
# --- chains/shuk_hayir.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, one-line property.
class ShukHayir(BinaProjectWebClient, SupermarketChain):
@property
def hostname_prefix(self): return "shuk-hayir"
# New declaration (added, Black-formatted): same hostname prefix override.
class ShukHayir(BinaProjectWebClient):
def hostname_prefix(self):
return "shuk-hayir"

View File

@@ -1,9 +1,9 @@
# --- chains/stop_market.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted strings.
class StopMarket(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): Black-formatted; username value is unchanged.
class StopMarket(CerberusWebClient):
_date_hour_format = "%Y-%m-%d %H:%M:%S"
@property
def username(self):
return 'Stop_Market'
return "Stop_Market"

View File

@@ -1,6 +1,7 @@
# --- chains/tiv_taam.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): empty subclass with explicit SupermarketChain base.
class TivTaam(CerberusWebClient, SupermarketChain):
pass
# New declaration (added): the chain supplies the Selenium login username.
class TivTaam(CerberusWebClient):
@property
def username(self):
return "TivTaam"

View File

@@ -1,6 +1,5 @@
# --- chains/victory.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
# SupermarketChain is already a base of MahsaneiHashook, so the redundant
# explicit base class was dropped.
class Victory(MahsaneiHashook, SupermarketChain):
class Victory(MahsaneiHashook):
pass

0
chains/yeinot_bitan.py Normal file
View File

View File

@@ -1,6 +1,9 @@
# --- chains/yohananof.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class Yohananof(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class Yohananof(CerberusWebClient):
@property
def username(self):
return "yohananof"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,5 @@
# --- chains/zol_vebegadol.py (diff rendering: the old line is shown
# immediately before its replacement) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped.
class ZolVebegadol(BinaProjectWebClient, SupermarketChain):
class ZolVebegadol(BinaProjectWebClient):
pass