Fixed the bug in cerberus_web_client.py by switching to Selenium. Each chain that uses it must define a username for the Selenium login. With this mechanism, a path to a local .gz file is returned instead of a URL.

Added the option to output a prices json file in main.py under --prices-with-promos, where the prices are updated by the latest promotions (under the 'final_price' key, where 'price' represents the price before promotions).

Fixed a small bug in BinaProjectWebClient by checking that the filename does not contain 'null'.

Changed Hierarchy of chains such that it includes the webclients.

Added the date to the output filenames to start storing the data over time.

Black formatting (according to PEP 8 guidelines).

Changed the chains_dict in main to a constant one.
This commit is contained in:
korenlazar
2022-10-04 11:42:36 +03:00
parent b5db721a3d
commit ceff48dbd9
28 changed files with 796 additions and 406 deletions

View File

@@ -1,6 +1,5 @@
# --- chains/bareket.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
# SupermarketChain is already a base of MahsaneiHashook, so the redundant
# explicit base class was dropped.
class Bareket(MahsaneiHashook, SupermarketChain):
class Bareket(MahsaneiHashook):
pass

View File

@@ -8,14 +8,16 @@ from supermarket_chain import SupermarketChain
FNAME_KEY = "FileNm"
# --- chains/binaproject_web_client.py (diff rendering: an old line is shown
# immediately before its replacement) ---
# New: the web client is now itself a SupermarketChain subclass.
class BinaProjectWebClient:
class BinaProjectWebClient(SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
_update_date_format = '%Y-%m-%d %H:%M:%S'
_path_prefix = ""
_hostname_suffix = ".binaprojects.com"
# Renamed: per the commit message, the method may now return a local file
# path as well as a URL.
def get_download_url(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
def get_download_url_or_path(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
-> str:
if not SupermarketChain.is_valid_store_id(store_id):
raise ValueError(f"Invalid {store_id=} (store id must be a natural number)")
hostname = f"http://{self.hostname_prefix}{self.hostname_suffix}"
url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"])
req_res: requests.Response = session.get(url)
@@ -27,7 +29,7 @@ class BinaProjectWebClient:
if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files):
return "" # Could not find non-full Promos/Prices file
else:
# Bug fix in this commit: also reject filenames containing 'null'.
filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname
filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname and 'null' not in fname
suffix = next(
cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY]))
down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix])

View File

@@ -1,35 +1,99 @@
import json
import re
import logging
import os
import shutil
import time
from abc import abstractmethod
import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from supermarket_chain import SupermarketChain
# --- chains/cerberus_web_client.py (diff: OLD implementation, removed by this
# commit) ---
# Pre-Selenium version: logged in with a plain requests POST (username only,
# no password field is sent) and returned a download URL scraped from the
# site's ajax_dir listing.
class CerberusWebClient:
def get_download_url(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
-> str:
hostname: str = "https://publishedprices.co.il"
# Post the payload to the site to log in
session.post(hostname + "/login/user", data={'username': self.username})
# Scrape the data
ajax_dir_payload: dict = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
s: requests.Response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
s_json: dict = json.loads(s.text)
# For plain Promos/Prices, skip any file whose name contains "full"
# (case-insensitive); other categories match on store id alone.
if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
filter_func = lambda d, id: f'-{id:03d}-20' in d['name'] and not re.search('full', d['name'], re.IGNORECASE)
if not any(filter_func(d, store_id) for d in s_json['aaData']):
return "" # Could not find non-full Prices/Promos file
else:
filter_func = lambda d, id: f'-{id:03d}-20' in d['name']
suffix: str = next(d['name'] for d in s_json['aaData'] if filter_func(d, store_id))
download_url: str = hostname + "/file/d/" + suffix
return download_url
class CerberusWebClient(SupermarketChain):
    """Web client for chains hosted on the Cerberus platform (publishedprices.co.il).

    Logs in through a Selenium-driven Chrome session using the chain-specific
    ``username`` property, locates the newest file matching the requested
    category and store, downloads it via the browser, and returns a local
    file path (not a URL) to the downloaded archive.
    """

    @property
    @abstractmethod
    def username(self) -> str:
        """Login username for this chain on the Cerberus site."""

    @staticmethod
    def _link_filter(store_id: int, category: SupermarketChain.XMLFilesCategory):
        """Return a predicate selecting hrefs matching *store_id* and *category*.

        Raises ValueError for an unknown category (fails fast, before any
        links are scanned).
        """
        store_token = f"-{store_id:03d}-20"
        if category == SupermarketChain.XMLFilesCategory.Promos:
            return lambda link: "promo" in link and "full" not in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.PromosFull:
            return lambda link: "promo" in link and "full" in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.Prices:
            return lambda link: "price" in link and "full" not in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.PricesFull:
            return lambda link: "price" in link and "full" in link and store_token in link
        if category == SupermarketChain.XMLFilesCategory.Stores:
            # Stores files are published once per chain under store id 000.
            return lambda link: "store" in link and "full" in link and "-000-20" in link
        raise ValueError(f"Unknown category type: {category=}")

    def get_download_url_or_path(
        self,
        store_id: int,
        category: SupermarketChain.XMLFilesCategory,
        session: requests.Session,
    ) -> str:
        """Download the newest matching file and return its local path.

        *session* is unused in the Selenium flow but kept for interface
        compatibility with the other web clients. Returns "" when no
        matching file is found.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("ignore-certificate-errors")
        options.add_argument("--ignore-ssl-errors=yes")
        driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()), options=options
        )
        try:
            driver.get("https://url.retail.publishedprices.co.il/login#")
            time.sleep(2)
            driver.find_element(By.NAME, "username").send_keys(self.username)
            driver.find_element(By.NAME, "Submit").click()
            time.sleep(2)

            # Narrow the file table with the site's search box.
            search_elem = driver.find_element(By.CLASS_NAME, "form-control")
            search_elem.send_keys(category.value)
            time.sleep(5)  # crude wait for the table to refresh — TODO: use WebDriverWait

            # The predicate only depends on store/category, so build it once
            # outside the loop (was rebuilt per link).
            filter_func = self._link_filter(store_id, category)
            best_link = ""
            for conn in driver.find_elements(By.CLASS_NAME, "f"):
                link = conn.get_attribute("href").lower()
                if filter_func(link):
                    # File names end with digits before the extension; keep
                    # the link with the largest (i.e. most recent) value.
                    if not best_link or int(link[-7:-3]) > int(best_link[-7:-3]):
                        best_link = link
            if not best_link:
                return ""

            # Clicking the link makes Chrome download the file.
            driver.get(best_link)
            time.sleep(3)  # crude wait for the download to complete
        finally:
            driver.quit()  # was leaked: every call left a Chrome process running

        # Chrome saves into the user's Downloads directory by default; allow an
        # env override instead of the previous hard-coded personal path.
        download_dir = os.environ.get(
            "CERBERUS_DOWNLOAD_DIR", os.path.expanduser(os.path.join("~", "Downloads"))
        )
        # Derive the filename from the URL (was a brittle magic slice [48:]).
        filename = os.path.basename(best_link)
        path_download = os.path.join(download_dir, filename)
        logging.info(f"{path_download=}")

        os.makedirs("raw_files", exist_ok=True)
        # Include the downloaded filename so different categories/stores do not
        # overwrite each other (the old fixed name collided).
        path_to_save = os.path.join("raw_files", f"{self.username}-{filename}")
        try:
            shutil.move(path_download, path_to_save)
            print(f"Downloaded {filename} and moved file to {path_to_save}")
        except (FileNotFoundError, shutil.Error):
            # Best-effort: the file may already have been moved by an earlier
            # run; a bare except here previously hid real errors too.
            print(f"{filename} already exists in {path_to_save}")
        return path_to_save

View File

@@ -1,6 +1,5 @@
# --- chains/co_op.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
# SupermarketChain is already a base of MahsaneiHashook, so the redundant
# explicit base class was dropped.
class CoOp(MahsaneiHashook, SupermarketChain):
class CoOp(MahsaneiHashook):
pass

View File

@@ -1,6 +1,9 @@
# --- chains/dor_alon.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class DorAlon(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): CerberusWebClient now derives from SupermarketChain
# itself, and the chain supplies the username used for the Selenium login.
class DorAlon(CerberusWebClient):
@property
def username(self):
return "doralon"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
# --- chains/freshmarket.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class Freshmarket(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class Freshmarket(CerberusWebClient):
_date_hour_format = "%Y-%m-%d %H:%M:%S"
@property
def username(self):
return "freshmarket"

View File

@@ -1,6 +1,9 @@
# --- chains/hazi_hinam.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class HaziHinam(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class HaziHinam(CerberusWebClient):
@property
def username(self):
return "HaziHinam"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
# --- chains/keshet.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class Keshet(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class Keshet(CerberusWebClient):
@property
def username(self):
return "Keshet"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,7 +1,6 @@
# --- chains/king_store.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped. The overrides below point the client at
# KingStore's own host instead of the *.binaprojects.com default.
class KingStore(BinaProjectWebClient, SupermarketChain):
class KingStore(BinaProjectWebClient):
_path_prefix = "Food_Law"
_hostname_suffix = ".co.il"

View File

@@ -1,6 +1,5 @@
# --- chains/maayan2000.py (diff rendering: the old lines are shown immediately
# before their replacements) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped.
class Maayan2000(BinaProjectWebClient, SupermarketChain):
pass
class Maayan2000(BinaProjectWebClient):
pass

View File

@@ -1,5 +1,6 @@
import re
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
@@ -9,33 +10,46 @@ from supermarket_chain import SupermarketChain
# --- chains/mahsaneiHashook.py (diff rendering: runs of old lines are shown
# immediately before their Black-reformatted replacements) ---
class MahsaneiHashook(SupermarketChain):
# Old single-quoted class attributes (removed) ...
_promotion_tag_name = 'Sale'
_promotion_update_tag_name = 'PriceUpdateDate'
_date_format = '%Y/%m/%d'
_date_hour_format = '%Y/%m/%d %H:%M:%S'
_update_date_format = '%Y/%m/%d %H:%M:%S'
_item_tag_name = 'Product'
# ... and their double-quoted replacements (added); values are unchanged.
_promotion_tag_name = "Sale"
_promotion_update_tag_name = "PriceUpdateDate"
_date_format = "%Y/%m/%d"
_date_hour_format = "%Y/%m/%d %H:%M:%S"
_update_date_format = "%Y/%m/%d %H:%M:%S"
_item_tag_name = "Product"
@staticmethod
# Renamed to get_download_url_or_path for consistency with the other clients
# (this implementation still returns a URL scraped from matrixcatalog.co.il).
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
def get_download_url_or_path(
store_id: int,
category: SupermarketChain.XMLFilesCategory,
session: requests.Session,
) -> str:
prefix = "http://matrixcatalog.co.il/"
url = prefix + "NBCompetitionRegulations.aspx"
req_res: requests.Response = requests.get(url)
soup = BeautifulSoup(req_res.text, features='lxml')
# For plain Promos/Prices categories, pick a file for this store whose name
# does NOT contain "full"; otherwise match on category and store id only.
if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
and f'-{store_id:03d}-20' in fname \
and not re.search('full', fname, re.IGNORECASE)
if soup.find('a', href=fname_filter_func) is None:
soup = BeautifulSoup(req_res.text, features="lxml")
if category in [
SupermarketChain.XMLFilesCategory.Promos,
SupermarketChain.XMLFilesCategory.Prices,
]:
fname_filter_func = (
lambda fname: fname
and category.name.replace("s", "") in fname
and f"-{store_id:03d}-20" in fname
and not re.search("full", fname, re.IGNORECASE)
)
if soup.find("a", href=fname_filter_func) is None:
return "" # Could not find non-full Promos/Prices file
else:
fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
and f'-{store_id:03d}-20' in fname
suffix: str = soup.find('a', href=fname_filter_func).attrs['href']
fname_filter_func = (
lambda fname: fname
and category.name.replace("s", "") in fname
and f"-{store_id:03d}-20" in fname
)
suffix: str = soup.find("a", href=fname_filter_func).attrs["href"]
down_url: str = prefix + suffix
return down_url
@staticmethod
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
# Look up the promoted item by its ItemCode; an unknown code yields [].
promo_item = items_dict.get(promo.find('ItemCode').text)
promo_item = items_dict.get(promo.find("ItemCode").text)
return [promo_item] if promo_item else []

View File

@@ -1,6 +1,9 @@
# --- chains/osher_ad.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class OsherAd(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class OsherAd(CerberusWebClient):
@property
def username(self):
return "osherad"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
# --- chains/rami_levi.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class RamiLevi(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class RamiLevi(CerberusWebClient):
@property
def username(self):
return "RamiLevi"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,5 @@
# --- chains/shefa_birkat_hashem.py (diff rendering: old lines are shown
# immediately before their replacements) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped.
class ShefaBirkatHashem(BinaProjectWebClient, SupermarketChain):
pass
class ShefaBirkatHashem(BinaProjectWebClient):
pass

View File

@@ -1,7 +1,7 @@
# --- chains/shuk_hayir.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, one-line property.
class ShukHayir(BinaProjectWebClient, SupermarketChain):
@property
def hostname_prefix(self): return "shuk-hayir"
# New declaration (added, Black-formatted): same hostname prefix override.
class ShukHayir(BinaProjectWebClient):
def hostname_prefix(self):
return "shuk-hayir"

View File

@@ -1,9 +1,9 @@
# --- chains/stop_market.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted strings.
class StopMarket(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): Black-formatted; username value is unchanged.
class StopMarket(CerberusWebClient):
_date_hour_format = "%Y-%m-%d %H:%M:%S"
@property
def username(self):
return 'Stop_Market'
return "Stop_Market"

View File

@@ -1,6 +1,7 @@
# --- chains/tiv_taam.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): empty subclass with explicit SupermarketChain base.
class TivTaam(CerberusWebClient, SupermarketChain):
pass
# New declaration (added): the chain supplies the Selenium login username.
class TivTaam(CerberusWebClient):
@property
def username(self):
return "TivTaam"

View File

@@ -1,6 +1,5 @@
# --- chains/victory.py (diff rendering: the old line is shown immediately
# before its replacement) ---
from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
# SupermarketChain is already a base of MahsaneiHashook, so the redundant
# explicit base class was dropped.
class Victory(MahsaneiHashook, SupermarketChain):
class Victory(MahsaneiHashook):
pass

0
chains/yeinot_bitan.py Normal file
View File

View File

@@ -1,6 +1,9 @@
# --- chains/yohananof.py (diff rendering: old lines are shown immediately
# before their replacements) ---
from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
# Old declaration (removed): explicit SupermarketChain base, single-quoted format.
class Yohananof(CerberusWebClient, SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S'
# New declaration (added): the chain supplies the Selenium login username.
class Yohananof(CerberusWebClient):
@property
def username(self):
return "yohananof"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,5 @@
# --- chains/zol_vebegadol.py (diff rendering: the old line is shown
# immediately before its replacement) ---
from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
# SupermarketChain is now inherited via BinaProjectWebClient, so the redundant
# explicit base class was dropped.
class ZolVebegadol(BinaProjectWebClient, SupermarketChain):
class ZolVebegadol(BinaProjectWebClient):
pass