Fixed the bug in cerberus_web_client.py by switching to Selenium. To log in, each chain that uses this client must define a username for the Selenium login. With this mechanism, a path to a downloaded gz file is returned instead of a URL.

Added an option in main.py, --prices-with-promos, to output a prices JSON file in which the prices are updated by the latest promotions (the updated price is stored under the 'final_price' key, while 'price' holds the price before promotions).

Fixed a small bug in BinaProjectWebClient by checking that the filename does not contain 'null'.

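Changed the class hierarchy of the chains so that the web clients are part of it (the web clients now inherit from SupermarketChain, and each chain inherits only from its web client).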

Added the date to the output filenames to start storing the data over time.

Black formatting (according to PEP 8 guidelines).

Changed the chain_dict in main.py to a constant (CHAINS_DICT) built from an explicit CHAINS_LIST.
This commit is contained in:
korenlazar
2022-10-04 11:42:36 +03:00
parent b5db721a3d
commit ceff48dbd9
28 changed files with 796 additions and 406 deletions

View File

@@ -1,6 +1,5 @@
from chains.mahsaneiHashook import MahsaneiHashook from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
class Bareket(MahsaneiHashook, SupermarketChain): class Bareket(MahsaneiHashook):
pass pass

View File

@@ -8,14 +8,16 @@ from supermarket_chain import SupermarketChain
FNAME_KEY = "FileNm" FNAME_KEY = "FileNm"
class BinaProjectWebClient: class BinaProjectWebClient(SupermarketChain):
_date_hour_format = '%Y-%m-%d %H:%M:%S' _date_hour_format = '%Y-%m-%d %H:%M:%S'
_update_date_format = '%Y-%m-%d %H:%M:%S' _update_date_format = '%Y-%m-%d %H:%M:%S'
_path_prefix = "" _path_prefix = ""
_hostname_suffix = ".binaprojects.com" _hostname_suffix = ".binaprojects.com"
def get_download_url(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \ def get_download_url_or_path(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
-> str: -> str:
if not SupermarketChain.is_valid_store_id(store_id):
raise ValueError(f"Invalid {store_id=} (store id must be a natural number)")
hostname = f"http://{self.hostname_prefix}{self.hostname_suffix}" hostname = f"http://{self.hostname_prefix}{self.hostname_suffix}"
url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"]) url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"])
req_res: requests.Response = session.get(url) req_res: requests.Response = session.get(url)
@@ -27,7 +29,7 @@ class BinaProjectWebClient:
if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files): if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files):
return "" # Could not find non-full Promos/Prices file return "" # Could not find non-full Promos/Prices file
else: else:
filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname and 'null' not in fname
suffix = next( suffix = next(
cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY])) cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY]))
down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix]) down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix])

View File

@@ -1,35 +1,99 @@
import json import logging
import re import os
import shutil
import time
from abc import abstractmethod
import requests import requests
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
class CerberusWebClient: class CerberusWebClient(SupermarketChain):
def get_download_url(self, store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) \
-> str:
hostname: str = "https://publishedprices.co.il"
# Post the payload to the site to log in
session.post(hostname + "/login/user", data={'username': self.username})
# Scrape the data
ajax_dir_payload: dict = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
s: requests.Response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
s_json: dict = json.loads(s.text)
if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
filter_func = lambda d, id: f'-{id:03d}-20' in d['name'] and not re.search('full', d['name'], re.IGNORECASE)
if not any(filter_func(d, store_id) for d in s_json['aaData']):
return "" # Could not find non-full Prices/Promos file
else:
filter_func = lambda d, id: f'-{id:03d}-20' in d['name']
suffix: str = next(d['name'] for d in s_json['aaData'] if filter_func(d, store_id))
download_url: str = hostname + "/file/d/" + suffix
return download_url
@property @property
@abstractmethod
def username(self): def username(self):
return repr(type(self)) pass
def get_download_url_or_path(
    self,
    store_id: int,
    category: SupermarketChain.XMLFilesCategory,
    session: requests.Session,
) -> str:
    """
    Download the best-matching file for a store from the published-prices
    site using a Selenium-driven Chrome, and move it under raw_files/.

    :param store_id: The store whose file is requested. It is matched against
        the links as a zero-padded, three-digit "-NNN-20" fragment. It is not
        used for the Stores category, which always matches "-000-20".
    :param category: The kind of file to look for.
    :param session: Unused by this client; kept so the signature matches the
        other web clients.
    :return: The path "raw_files/<username>-<filename>", or an empty string
        when no link on the page matches. The path is returned even when
        moving the downloaded file failed (a warning is logged in that case).
    :raises ValueError: If category is not one of the handled categories.
        This is checked before the browser is launched.
    """
    kinds = SupermarketChain.XMLFilesCategory

    # Resolve the matching rule once instead of rebuilding it for every link.
    if category == kinds.Stores:
        keyword, wants_full, marker = "store", True, "-000-20"
    else:
        per_store_rules = (
            (kinds.Promos, "promo", False),
            (kinds.PromosFull, "promo", True),
            (kinds.Prices, "price", False),
            (kinds.PricesFull, "price", True),
        )
        for known_category, keyword, wants_full in per_store_rules:
            if category == known_category:
                break
        else:
            raise ValueError(f"Unknown category type: {category=}")
        marker = f"-{store_id:03d}-20"

    def matches(candidate: str) -> bool:
        """Tell whether a lower-cased link satisfies the resolved rule."""
        return (
            keyword in candidate
            and ("full" in candidate) == wants_full
            and marker in candidate
        )

    options = webdriver.ChromeOptions()
    options.add_argument("ignore-certificate-errors")
    options.add_argument("--ignore-ssl-errors=yes")
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    try:
        # The fixed sleeps give the page time to load/refresh between steps.
        driver.get("https://url.retail.publishedprices.co.il/login#")
        time.sleep(2)
        username_field = driver.find_element(By.NAME, "username")
        username_field.send_keys(self.username)
        driver.find_element(By.NAME, "Submit").click()
        time.sleep(2)
        search_field = driver.find_element(By.CLASS_NAME, "form-control")
        search_field.send_keys(category.value)
        time.sleep(5)

        best_link = ""
        for anchor in driver.find_elements(By.CLASS_NAME, "f"):
            href = anchor.get_attribute("href")
            if not href:
                continue  # An element without an href cannot be a file link.
            link = href.lower()
            if not matches(link):
                continue
            # NOTE(review): link[-7:-3] is presumably the trailing HHMM part of
            # the file's timestamp, so only the time of day is compared, not
            # the date - confirm against the site's file naming.
            if not best_link or int(link[-7:-3]) > int(best_link[-7:-3]):
                best_link = link

        if not best_link:
            return ""

        # NOTE(review): the lower-cased link is used both for the download and
        # for the local file name, as before; this relies on a
        # case-insensitive server/filesystem - confirm.
        driver.get(best_link)
        time.sleep(3)
    finally:
        # Always close the browser so no Chrome process is left running.
        driver.quit()

    # Assumes Chrome saves into the current user's "Downloads" directory.
    download_dir = os.path.join(os.path.expanduser("~"), "Downloads")
    # The file name is the last path component of the link (previously taken
    # with a hard-coded offset equal to the length of the URL prefix).
    filename = best_link.rsplit("/", 1)[-1]
    path_download = os.path.join(download_dir, filename)
    logging.info(f"{path_download=}")
    path_to_save = f"raw_files/{self.username}-{filename}"
    try:
        shutil.move(path_download, path_to_save)
    except OSError as move_error:
        # Best effort, as before: report the real reason and still return the
        # target path, which may already hold a previously moved file.
        logging.warning(
            "Could not move %s to %s: %s", path_download, path_to_save, move_error
        )
    else:
        print(f"Downloaded {filename} and moved file to {path_to_save}")
    return path_to_save

View File

@@ -1,6 +1,5 @@
from chains.mahsaneiHashook import MahsaneiHashook from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
class CoOp(MahsaneiHashook, SupermarketChain): class CoOp(MahsaneiHashook):
pass pass

View File

@@ -1,6 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class DorAlon(CerberusWebClient, SupermarketChain): class DorAlon(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' @property
def username(self):
return "doralon"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class Freshmarket(CerberusWebClient, SupermarketChain): class Freshmarket(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' _date_hour_format = "%Y-%m-%d %H:%M:%S"
@property
def username(self):
return "freshmarket"

View File

@@ -1,6 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class HaziHinam(CerberusWebClient, SupermarketChain): class HaziHinam(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' @property
def username(self):
return "HaziHinam"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class Keshet(CerberusWebClient, SupermarketChain): class Keshet(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' @property
def username(self):
return "Keshet"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,7 +1,6 @@
from chains.binaproject_web_client import BinaProjectWebClient from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
class KingStore(BinaProjectWebClient, SupermarketChain): class KingStore(BinaProjectWebClient):
_path_prefix = "Food_Law" _path_prefix = "Food_Law"
_hostname_suffix = ".co.il" _hostname_suffix = ".co.il"

View File

@@ -1,6 +1,5 @@
from chains.binaproject_web_client import BinaProjectWebClient from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
class Maayan2000(BinaProjectWebClient, SupermarketChain): class Maayan2000(BinaProjectWebClient):
pass pass

View File

@@ -1,5 +1,6 @@
import re import re
from typing import Dict, List from typing import Dict, List
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
@@ -9,33 +10,46 @@ from supermarket_chain import SupermarketChain
class MahsaneiHashook(SupermarketChain): class MahsaneiHashook(SupermarketChain):
_promotion_tag_name = 'Sale' _promotion_tag_name = "Sale"
_promotion_update_tag_name = 'PriceUpdateDate' _promotion_update_tag_name = "PriceUpdateDate"
_date_format = '%Y/%m/%d' _date_format = "%Y/%m/%d"
_date_hour_format = '%Y/%m/%d %H:%M:%S' _date_hour_format = "%Y/%m/%d %H:%M:%S"
_update_date_format = '%Y/%m/%d %H:%M:%S' _update_date_format = "%Y/%m/%d %H:%M:%S"
_item_tag_name = 'Product' _item_tag_name = "Product"
@staticmethod @staticmethod
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str: def get_download_url_or_path(
store_id: int,
category: SupermarketChain.XMLFilesCategory,
session: requests.Session,
) -> str:
prefix = "http://matrixcatalog.co.il/" prefix = "http://matrixcatalog.co.il/"
url = prefix + "NBCompetitionRegulations.aspx" url = prefix + "NBCompetitionRegulations.aspx"
req_res: requests.Response = requests.get(url) req_res: requests.Response = requests.get(url)
soup = BeautifulSoup(req_res.text, features='lxml') soup = BeautifulSoup(req_res.text, features="lxml")
if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]: if category in [
fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \ SupermarketChain.XMLFilesCategory.Promos,
and f'-{store_id:03d}-20' in fname \ SupermarketChain.XMLFilesCategory.Prices,
and not re.search('full', fname, re.IGNORECASE) ]:
if soup.find('a', href=fname_filter_func) is None: fname_filter_func = (
lambda fname: fname
and category.name.replace("s", "") in fname
and f"-{store_id:03d}-20" in fname
and not re.search("full", fname, re.IGNORECASE)
)
if soup.find("a", href=fname_filter_func) is None:
return "" # Could not find non-full Promos/Prices file return "" # Could not find non-full Promos/Prices file
else: else:
fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \ fname_filter_func = (
and f'-{store_id:03d}-20' in fname lambda fname: fname
suffix: str = soup.find('a', href=fname_filter_func).attrs['href'] and category.name.replace("s", "") in fname
and f"-{store_id:03d}-20" in fname
)
suffix: str = soup.find("a", href=fname_filter_func).attrs["href"]
down_url: str = prefix + suffix down_url: str = prefix + suffix
return down_url return down_url
@staticmethod @staticmethod
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]: def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
promo_item = items_dict.get(promo.find('ItemCode').text) promo_item = items_dict.get(promo.find("ItemCode").text)
return [promo_item] if promo_item else [] return [promo_item] if promo_item else []

View File

@@ -1,6 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class OsherAd(CerberusWebClient, SupermarketChain): class OsherAd(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' @property
def username(self):
return "osherad"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class RamiLevi(CerberusWebClient, SupermarketChain): class RamiLevi(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' @property
def username(self):
return "RamiLevi"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,5 @@
from chains.binaproject_web_client import BinaProjectWebClient from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
class ShefaBirkatHashem(BinaProjectWebClient, SupermarketChain): class ShefaBirkatHashem(BinaProjectWebClient):
pass pass

View File

@@ -1,7 +1,7 @@
from chains.binaproject_web_client import BinaProjectWebClient from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
class ShukHayir(BinaProjectWebClient, SupermarketChain): class ShukHayir(BinaProjectWebClient):
@property @property
def hostname_prefix(self): return "shuk-hayir" def hostname_prefix(self):
return "shuk-hayir"

View File

@@ -1,9 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class StopMarket(CerberusWebClient, SupermarketChain): class StopMarket(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' _date_hour_format = "%Y-%m-%d %H:%M:%S"
@property @property
def username(self): def username(self):
return 'Stop_Market' return "Stop_Market"

View File

@@ -1,6 +1,7 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class TivTaam(CerberusWebClient, SupermarketChain): class TivTaam(CerberusWebClient):
pass @property
def username(self):
return "TivTaam"

View File

@@ -1,6 +1,5 @@
from chains.mahsaneiHashook import MahsaneiHashook from chains.mahsaneiHashook import MahsaneiHashook
from supermarket_chain import SupermarketChain
class Victory(MahsaneiHashook, SupermarketChain): class Victory(MahsaneiHashook):
pass pass

0
chains/yeinot_bitan.py Normal file
View File

View File

@@ -1,6 +1,9 @@
from chains.cerberus_web_client import CerberusWebClient from chains.cerberus_web_client import CerberusWebClient
from supermarket_chain import SupermarketChain
class Yohananof(CerberusWebClient, SupermarketChain): class Yohananof(CerberusWebClient):
_date_hour_format = '%Y-%m-%d %H:%M:%S' @property
def username(self):
return "yohananof"
_date_hour_format = "%Y-%m-%d %H:%M:%S"

View File

@@ -1,6 +1,5 @@
from chains.binaproject_web_client import BinaProjectWebClient from chains.binaproject_web_client import BinaProjectWebClient
from supermarket_chain import SupermarketChain
class ZolVebegadol(BinaProjectWebClient, SupermarketChain): class ZolVebegadol(BinaProjectWebClient):
pass pass

32
item.py
View File

@@ -1,14 +1,44 @@
import json
import re
from bs4.element import Tag
class Item: class Item:
""" """
A class representing a product in some supermarket. A class representing a product in some supermarket.
""" """
def __init__(self, name: str, price: float, price_by_measure: float, code: str, manufacturer: str): def __init__(
self,
name: str,
price: float,
price_by_measure: float,
code: str,
manufacturer: str,
):
self.name: str = name self.name: str = name
self.price: float = price self.price: float = price
self.final_price: float = price
self.price_by_measure = price_by_measure self.price_by_measure = price_by_measure
self.manufacturer: str = manufacturer self.manufacturer: str = manufacturer
self.code: str = code self.code: str = code
@classmethod
def from_tag(cls, item: Tag):
    """
    Alternate constructor: build an instance from the child tags of an xml tag.

    :param item: A tag whose children hold the item's name, price, price by
        measure, code and manufacturer.
    :return: A new instance populated from the text of those children.
    """

    def text_of(tag_name):
        # Text content of the first descendant matching the name or pattern.
        return item.find(tag_name).text

    # The patterns tolerate the spelling variants of the name tags.
    item_name = text_of(re.compile(r"ItemN[a]?m[e]?"))
    item_price = float(text_of("ItemPrice"))
    measure_price = float(text_of("UnitOfMeasurePrice"))
    item_code = text_of("ItemCode")
    item_manufacturer = text_of(re.compile(r"Manufacture[r]?Name"))
    return cls(
        name=item_name,
        price=item_price,
        price_by_measure=measure_price,
        code=item_code,
        manufacturer=item_manufacturer,
    )
def to_json(self):
    """
    Serialize this object to a JSON string built from its instance attributes.
    """

    def attributes_of(obj):
        # json calls this for every value it cannot encode natively,
        # starting with the object itself.
        return obj.__dict__

    return json.dumps(self, default=attributes_of)
def __repr__(self): def __repr__(self):
return f"\nשם: {self.name}\nמחיר: {self.price}\nיצרן: {self.manufacturer}\nקוד: {self.code}\n" return f"\nשם: {self.name}\nמחיר: {self.price}\nיצרן: {self.manufacturer}\nקוד: {self.code}\n"

284
main.py
View File

@@ -1,127 +1,212 @@
import os import json
from argparse import ArgumentParser
from datetime import datetime
from pathlib import Path
import logging import logging
import os
import subprocess
import sys
from argparse import ArgumentParser
from datetime import datetime, date
from pathlib import Path
from promotion import main_latest_promos, log_promos_by_name from chains.bareket import Bareket
from chains.co_op import CoOp
from chains.dor_alon import DorAlon
from chains.freshmarket import Freshmarket
from chains.hazi_hinam import HaziHinam
from chains.keshet import Keshet
from chains.king_store import KingStore
from chains.maayan2000 import Maayan2000
from chains.mahsaneiHashook import MahsaneiHashook
from chains.osher_ad import OsherAd
from chains.rami_levi import RamiLevi
from chains.shefa_birkat_hashem import ShefaBirkatHashem
from chains.shufersal import Shufersal
from chains.shuk_hayir import ShukHayir
from chains.stop_market import StopMarket
from chains.tiv_taam import TivTaam
from chains.victory import Victory
from chains.yohananof import Yohananof
from chains.zol_vebegadol import ZolVebegadol
from promotion import main_latest_promos, log_promos_by_name, get_all_prices
from store_utils import log_stores_ids from store_utils import log_stores_ids
from utils import RESULTS_DIRNAME, RAW_FILES_DIRNAME, VALID_PROMOTION_FILE_EXTENSIONS, log_products_prices, \
valid_promotion_output_file
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
from chains import ( from utils import (
bareket, RESULTS_DIRNAME,
mahsaneiHashook, RAW_FILES_DIRNAME,
dor_alon, VALID_PROMOTION_FILE_EXTENSIONS,
freshmarket, log_products_prices,
hazi_hinam, valid_promotion_output_file,
keshet, is_valid_promotion_output_file,
stop_market,
tiv_taam,
shufersal,
co_op,
victory,
yohananof,
zol_vebegadol,
rami_levi,
osher_ad,
maayan2000,
shuk_hayir,
king_store,
shefa_birkat_hashem,
) )
CHAINS_LIST = [
Bareket,
MahsaneiHashook,
DorAlon,
Freshmarket,
HaziHinam,
Keshet,
StopMarket,
TivTaam,
Shufersal,
CoOp,
Victory,
Yohananof,
ZolVebegadol,
RamiLevi,
OsherAd,
Maayan2000,
ShukHayir,
KingStore,
ShefaBirkatHashem,
]
Path(RESULTS_DIRNAME).mkdir(exist_ok=True) Path(RESULTS_DIRNAME).mkdir(exist_ok=True)
Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True) Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()} CHAINS_DICT = {
repr(chain): chain() if callable(chain) else None for chain in CHAINS_LIST
}
# TODO: change functions arguments to include all necessary parameters (e.g. chain) or split arguments # TODO: change functions arguments to include all necessary parameters (e.g. chain) or split arguments
if __name__ == '__main__': if __name__ == "__main__":
parser = ArgumentParser() parser = ArgumentParser()
parser.add_argument('--promos', parser.add_argument(
help="generates a CSV file with all the promotions in the requested store", "--promos",
metavar='store_id', help="generates a CSV file with all the promotions in the requested store",
nargs=1, metavar="store_id",
type=SupermarketChain.store_id_type, nargs=1,
) type=SupermarketChain.store_id_type,
parser.add_argument('--find_promos_by_name', )
help="prints all promos containing the given promo_name in the given store", parser.add_argument(
metavar=('store_id', 'promo_name'), "--find_promos_by_name",
nargs=2, help="prints all promos containing the given promo_name in the given store",
# type=store_id_type, # TODO: add type-checking of first parameter metavar=("store_id", "promo_name"),
) nargs=2,
parser.add_argument('--price', )
help='prints all products that contain the given name in the requested store', parser.add_argument(
metavar=('store_id', 'product_name'), "--price",
nargs=2, help="prints all products that contain the given name in the requested store",
) metavar=("store_id", "product_name"),
parser.add_argument('--find_store_id', nargs=2,
help='prints all Shufersal stores in a given city. Input should be a city name in Hebrew', )
metavar='city', parser.add_argument(
nargs=1, "--prices-with-promos",
) help="logs all products with prices updated by promos",
parser.add_argument('--load_prices', metavar="store_id",
help='boolean flag representing whether to load an existing price XML file', nargs=1,
action='store_true', type=SupermarketChain.store_id_type,
) )
parser.add_argument('--load_promos', parser.add_argument(
help='boolean flag representing whether to load an existing promo XML file', "--find_store_id",
action='store_true', help="prints all Shufersal stores in a given city. Input should be a city name in Hebrew",
) metavar="city",
parser.add_argument('--load_stores', nargs=1,
help='boolean flag representing whether to load an existing stores XML file', )
action='store_true', parser.add_argument(
) "--load_prices",
parser.add_argument('--chain', help="boolean flag representing whether to load an existing price XML file",
required=True, action="store_true",
help='The name of the requested chain', )
choices=chain_dict.keys(), parser.add_argument(
) "--load_promos",
parser.add_argument('--file_extension', help="boolean flag representing whether to load an existing promo XML file",
help='The extension of the promotions output file', action="store_true",
choices=VALID_PROMOTION_FILE_EXTENSIONS, )
default='.xlsx', parser.add_argument(
) "--load_stores",
parser.add_argument('--output_filename', help="boolean flag representing whether to load an existing stores XML file",
help='The path to write the promotions table to', action="store_true",
type=valid_promotion_output_file, )
) parser.add_argument(
parser.add_argument('--only_export_to_file', "--chain",
help='Boolean flag representing whether only export or also open the promotion output file', required=True,
action='store_true', help="The name of the requested chain",
) choices=CHAINS_DICT.keys(),
parser.add_argument('--debug', )
help='Boolean flag representing whether to run in debug mode', parser.add_argument(
action='store_true', "--output_filename",
) help="The path to write the promotions/prices to",
type=valid_promotion_output_file,
)
parser.add_argument(
"--only_export_to_file",
help="Boolean flag representing whether only export or also open the promotion output file",
action="store_true",
)
parser.add_argument(
"--debug",
help="Boolean flag representing whether to run in debug mode",
action="store_true",
)
args = parser.parse_args() args = parser.parse_args()
if args.debug: if args.debug:
logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.DEBUG)
else: else:
logging.basicConfig(level=logging.INFO, format='%(levelname)s:%(message)s') logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(message)s")
chain: SupermarketChain = chain_dict[args.chain] chain: SupermarketChain = CHAINS_DICT[args.chain]
if args.promos: if args.promos or args.prices_with_promos:
arg_store_id = int(args.promos[0]) arg_store_id = (
int(args.promos[0]) if args.promos else int(args.prices_with_promos[0])
)
if args.output_filename: if args.output_filename:
output_filename = args.output_filename output_filename = args.output_filename
if args.promos and not is_valid_promotion_output_file(output_filename):
raise ValueError(
f"Output filename for promos must end with: {VALID_PROMOTION_FILE_EXTENSIONS}"
)
if args.prices_with_promos and not output_filename.endswith(".json"):
raise ValueError(f"Output filename for promos must be a json file")
directory = os.path.dirname(output_filename) directory = os.path.dirname(output_filename)
Path(directory).mkdir(parents=True, exist_ok=True) Path(directory).mkdir(parents=True, exist_ok=True)
else: else:
Path(RESULTS_DIRNAME).mkdir(exist_ok=True) Path(RESULTS_DIRNAME).mkdir(exist_ok=True)
output_filename = f'{RESULTS_DIRNAME}/{repr(type(chain))}_promos_{arg_store_id}{args.file_extension}' file_extension = ".xlsx" if args.promos else ".json"
file_type = "promos" if args.promos else "prices"
output_filename = f"{RESULTS_DIRNAME}/{repr(type(chain))}-{file_type}-{arg_store_id}-{date.today()}{file_extension}"
if args.promos:
main_latest_promos(
store_id=arg_store_id,
output_filename=output_filename,
chain=chain,
load_promos=args.load_promos,
load_prices=args.load_prices,
)
else:
items_dict = get_all_prices(
store_id=arg_store_id,
output_filename=output_filename,
chain=chain,
load_promos=args.load_promos,
load_prices=args.load_prices,
)
items_dict_to_json = {
item_code: {
k: v
for k, v in item.__dict__.items()
if not k.startswith("__") and not callable(k)
}
for item_code, item in items_dict.items()
}
with open(output_filename, "w") as fOut:
json.dump(items_dict_to_json, fOut)
main_latest_promos(store_id=arg_store_id, output_filename=output_filename, chain=chain,
load_promos=args.load_promos, load_xml=args.load_prices)
if not args.only_export_to_file: if not args.only_export_to_file:
os.startfile(Path(output_filename)) opener = "open" if sys.platform == "darwin" else "xdg-open"
logging.debug(f'Process finished at: {datetime.now()}') subprocess.call([opener, Path(output_filename)])
# os.startfile(Path(output_filename))
logging.debug(f"Process finished at: {datetime.now()}")
elif args.price: elif args.price:
log_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1]) log_products_prices(
chain,
store_id=args.price[0],
load_xml=args.load_prices,
product_name=args.price[1],
)
elif args.find_store_id: elif args.find_store_id:
arg_city = args.find_store_id[0] arg_city = args.find_store_id[0]
@@ -129,5 +214,10 @@ if __name__ == '__main__':
elif args.find_promos_by_name: elif args.find_promos_by_name:
arg_store_id = int(args.find_promos_by_name[0]) arg_store_id = int(args.find_promos_by_name[0])
log_promos_by_name(store_id=arg_store_id, chain=chain, promo_name=args.find_promos_by_name[1], log_promos_by_name(
load_prices=args.load_prices, load_promos=args.load_promos) store_id=arg_store_id,
chain=chain,
promo_name=args.find_promos_by_name[1],
load_prices=args.load_prices,
load_promos=args.load_promos,
)

View File

@@ -12,45 +12,51 @@ from aenum import Enum
from item import Item from item import Item
from utils import ( from utils import (
create_bs_object, create_items_dict, create_bs_object,
create_items_dict,
get_float_from_tag, get_float_from_tag,
log_message_and_time_if_debug, xml_file_gen, log_message_and_time_if_debug,
xml_file_gen,
) )
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
XML_FILES_PROMOTIONS_CATEGORIES = [SupermarketChain.XMLFilesCategory.PromosFull, XML_FILES_PROMOTIONS_CATEGORIES = [
SupermarketChain.XMLFilesCategory.Promos] SupermarketChain.XMLFilesCategory.PromosFull,
SupermarketChain.XMLFilesCategory.Promos,
]
PROMOTION_COLS_NUM = 15 # The length of the list returned by get_promotion_row_for_table function PROMOTION_COLS_NUM = (
15 # The length of the list returned by get_promotion_row_for_table function
)
INVALID_OR_UNKNOWN_PROMOTION_FUNCTION = -1 INVALID_OR_UNKNOWN_PROMOTION_FUNCTION = -1
PROMOTIONS_TABLE_HEADERS = [ PROMOTIONS_TABLE_HEADERS = [
'תיאור מבצע', "תיאור מבצע",
'הפריט המשתתף במבצע', "הפריט המשתתף במבצע",
'מחיר לפני מבצע', "מחיר לפני מבצע",
'מחיר אחרי מבצע', "מחיר אחרי מבצע",
'אחוז הנחה', "אחוז הנחה",
'סוג מבצע', "סוג מבצע",
'כמות מקס', "כמות מקס",
'כפל הנחות', "כפל הנחות",
'המבצע החל', "המבצע החל",
'זמן תחילת מבצע', "זמן תחילת מבצע",
'זמן סיום מבצע', "זמן סיום מבצע",
'זמן עדכון אחרון', "זמן עדכון אחרון",
'יצרן', "יצרן",
'ברקוד פריט', "ברקוד פריט",
'סוג מבצע לפי תקנות שקיפות מחירים', "סוג מבצע לפי תקנות שקיפות מחירים",
] ]
class ClubID(Enum): class ClubID(Enum):
_init_ = 'value string' _init_ = "value string"
REGULAR = 0, 'מבצע רגיל' REGULAR = 0, "מבצע רגיל"
CLUB = 1, 'מועדון' CLUB = 1, "מועדון"
CREDIT_CARD = 2, 'כרטיס אשראי' CREDIT_CARD = 2, "כרטיס אשראי"
OTHER = 3, 'אחר' OTHER = 3, "אחר"
@classmethod @classmethod
def _missing_(cls, value): def _missing_(cls, value):
@@ -79,9 +85,20 @@ class Promotion:
It contains only part of the available information in Shufersal's data. It contains only part of the available information in Shufersal's data.
""" """
def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime, items: List[Item], def __init__(
promo_func: callable, club_id: ClubID, promotion_id: int, max_qty: int, self,
allow_multiple_discounts: bool, reward_type: RewardType): content: str,
start_date: datetime,
end_date: datetime,
update_date: datetime,
items: List[Item],
promo_func: callable,
club_id: ClubID,
promotion_id: int,
max_qty: int,
allow_multiple_discounts: bool,
reward_type: RewardType,
):
self.content: str = content self.content: str = content
self.start_date: datetime = start_date self.start_date: datetime = start_date
self.end_date: datetime = end_date self.end_date: datetime = end_date
@@ -98,41 +115,49 @@ class Promotion:
title = self.content title = self.content
dates_range = f"Between {self.start_date} and {self.end_date}" dates_range = f"Between {self.start_date} and {self.end_date}"
update_line = f"Updated at {self.update_date}" update_line = f"Updated at {self.update_date}"
return '\n'.join([title, dates_range, update_line, str(self.items)]) + '\n' return "\n".join([title, dates_range, update_line, str(self.items)]) + "\n"
def __eq__(self, other): def __eq__(self, other):
return self.promotion_id == other.promotion_id return self.promotion_id == other.promotion_id
def write_promotions_to_table(promotions: List[Promotion], output_filename: str) -> None: def write_promotions_to_table(
promotions: List[Promotion], output_filename: str
) -> None:
""" """
This function writes a List of promotions to a csv or xlsx output file. This function writes a List of promotions to a csv or xlsx output file.
:param promotions: A given list of promotions :param promotions: A given list of promotions
:param output_filename: A given file to write to :param output_filename: A given file to write to
""" """
log_message_and_time_if_debug('Writing promotions to output file') log_message_and_time_if_debug("Writing promotions to output file")
rows = [get_promotion_row_for_table(promo, item) for promo in promotions for item in promo.items] rows = [
if output_filename.endswith('.csv'): get_promotion_row_for_table(promo, item)
for promo in promotions
for item in promo.items
]
if output_filename.endswith(".csv"):
encoding_file = "utf_8_sig" if sys.platform == "win32" else "utf_8" encoding_file = "utf_8_sig" if sys.platform == "win32" else "utf_8"
with open(output_filename, mode='w', newline='', encoding=encoding_file) as f_out: with open(
output_filename, mode="w", newline="", encoding=encoding_file
) as f_out:
promos_writer = csv.writer(f_out) promos_writer = csv.writer(f_out)
promos_writer.writerow(PROMOTIONS_TABLE_HEADERS) promos_writer.writerow(PROMOTIONS_TABLE_HEADERS)
promos_writer.writerows(rows) promos_writer.writerows(rows)
elif output_filename.endswith('.xlsx'): elif output_filename.endswith(".xlsx"):
df = pd.DataFrame(rows, columns=PROMOTIONS_TABLE_HEADERS) df = pd.DataFrame(rows, columns=PROMOTIONS_TABLE_HEADERS)
workbook = xlsxwriter.Workbook(output_filename) workbook = xlsxwriter.Workbook(output_filename)
worksheet1 = workbook.add_worksheet() worksheet1 = workbook.add_worksheet()
worksheet1.right_to_left() worksheet1.right_to_left()
date_time_format = workbook.add_format({'num_format': 'm/d/yy h:mm;@'}) date_time_format = workbook.add_format({"num_format": "m/d/yy h:mm;@"})
number_format = workbook.add_format({'num_format': '0.00'}) number_format = workbook.add_format({"num_format": "0.00"})
percentage_format = workbook.add_format({'num_format': '0.00%'}) percentage_format = workbook.add_format({"num_format": "0.00%"})
worksheet1.set_column('A:A', width=35) worksheet1.set_column("A:A", width=35)
worksheet1.set_column('B:B', width=25) worksheet1.set_column("B:B", width=25)
worksheet1.set_column('C:D', cell_format=number_format) worksheet1.set_column("C:D", cell_format=number_format)
worksheet1.set_column('E:E', cell_format=percentage_format) worksheet1.set_column("E:E", cell_format=percentage_format)
worksheet1.set_column('J:L', width=15, cell_format=date_time_format) worksheet1.set_column("J:L", width=15, cell_format=date_time_format)
worksheet1.add_table( worksheet1.add_table(
first_row=0, first_row=0,
first_col=0, first_col=0,
@@ -141,12 +166,15 @@ def write_promotions_to_table(promotions: List[Promotion], output_filename: str)
options={ options={
"columns": [{"header": i} for i in PROMOTIONS_TABLE_HEADERS], "columns": [{"header": i} for i in PROMOTIONS_TABLE_HEADERS],
"data": df.values.tolist(), "data": df.values.tolist(),
'style': 'Table Style Medium 11', "style": "Table Style Medium 11",
}, ) },
)
workbook.close() workbook.close()
else: else:
raise ValueError(f"The given output file has an invalid extension:\n{output_filename}") raise ValueError(
f"The given output file has an invalid extension:\n{output_filename}"
)
def get_promotion_row_for_table(promo: Promotion, item: Item) -> List: def get_promotion_row_for_table(promo: Promotion, item: Item) -> List:
@@ -175,8 +203,9 @@ def get_promotion_row_for_table(promo: Promotion, item: Item) -> List:
] ]
def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos: bool) \ def get_available_promos(
-> List[Promotion]: chain: SupermarketChain, store_id: int, load_prices: bool, load_promos: bool
) -> List[Promotion]:
""" """
This function return the available promotions given a BeautifulSoup object. This function return the available promotions given a BeautifulSoup object.
@@ -186,15 +215,15 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
:param load_promos: A boolean representing whether to load an existing promotion file or download it :param load_promos: A boolean representing whether to load an existing promotion file or download it
:return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available
""" """
log_message_and_time_if_debug('Importing prices XML file') log_message_and_time_if_debug("Importing prices XML file")
items_dict: Dict[str, Item] = create_items_dict(chain, store_id, load_prices) items_dict: Dict[str, Item] = create_items_dict(chain, store_id, load_prices)
log_message_and_time_if_debug('Importing promotions XML file') log_message_and_time_if_debug("Importing promotions XML file")
promo_tags = get_all_promos_tags(chain, store_id, load_promos) promo_tags = get_all_promos_tags(chain, store_id, load_promos)
log_message_and_time_if_debug('Creating promotions objects') log_message_and_time_if_debug("Creating promotions objects")
promo_objs = list() promo_objs = list()
for promo in tqdm(promo_tags, desc='creating_promotions'): for promo in tqdm(promo_tags, desc="creating_promotions"):
promotion_id = int(promo.find(re.compile('PromotionId', re.IGNORECASE)).text) promotion_id = int(promo.find(re.compile("PromotionId", re.IGNORECASE)).text)
if promo_objs and promo_objs[-1].promotion_id == promotion_id: if promo_objs and promo_objs[-1].promotion_id == promotion_id:
promo_objs[-1].items.extend(chain.get_items(promo, items_dict)) promo_objs[-1].items.extend(chain.get_items(promo, items_dict))
continue continue
@@ -206,8 +235,9 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
return promo_objs return promo_objs
def create_new_promo_instance(chain: SupermarketChain, items_dict: Dict[str, Item], promo: Tag, promotion_id: int) \ def create_new_promo_instance(
-> Union[Promotion, None]: chain: SupermarketChain, items_dict: Dict[str, Item], promo: Tag, promotion_id: int
) -> Union[Promotion, None]:
""" """
This function generates a Promotion object from a promotion tag. This function generates a Promotion object from a promotion tag.
@@ -217,41 +247,64 @@ def create_new_promo_instance(chain: SupermarketChain, items_dict: Dict[str, Ite
:param promotion_id: An integer representing the promotion ID :param promotion_id: An integer representing the promotion ID
:return: If the promotion expired - return None, else return the Promotion object :return: If the promotion expired - return None, else return the Promotion object
""" """
promo_end_time = datetime.strptime(promo.find('PromotionEndDate').text + ' ' + promo_end_time = datetime.strptime(
promo.find('PromotionEndHour').text, promo.find("PromotionEndDate").text + " " + promo.find("PromotionEndHour").text,
chain.date_hour_format) chain.date_hour_format,
)
if promo_end_time < datetime.now(): if promo_end_time < datetime.now():
return None return None
reward_type = RewardType(int(promo.find("RewardType").text)) reward_type = RewardType(int(promo.find("RewardType").text))
discounted_price = get_discounted_price(promo) discounted_price = get_discounted_price(promo)
promo_description = promo.find('PromotionDescription').text promo_description = promo.find("PromotionDescription").text
is_discount_in_percentage = reward_type == RewardType.DISCOUNT_IN_PERCENTAGE or not discounted_price is_discount_in_percentage = (
raw_discount_rate = promo.find('DiscountRate').text if promo.find('DiscountRate') else None reward_type == RewardType.DISCOUNT_IN_PERCENTAGE or not discounted_price
)
raw_discount_rate = (
promo.find("DiscountRate").text if promo.find("DiscountRate") else None
)
discount_rate = get_discount_rate(raw_discount_rate, is_discount_in_percentage) discount_rate = get_discount_rate(raw_discount_rate, is_discount_in_percentage)
min_qty = get_float_from_tag(promo, 'MinQty') min_qty = get_float_from_tag(promo, "MinQty")
max_qty = get_float_from_tag(promo, 'MaxQty') max_qty = get_float_from_tag(promo, "MaxQty")
remark = promo.find("Remark") remark = promo.find("Remark")
promo_func = find_promo_function(reward_type=reward_type, remark=remark.text if remark else '', promo_func = find_promo_function(
promo_description=promo_description, min_qty=min_qty, reward_type=reward_type,
discount_rate=discount_rate, discounted_price=discounted_price) remark=remark.text if remark else "",
promo_start_time = datetime.strptime(promo.find('PromotionStartDate').text + ' ' + promo_description=promo_description,
promo.find('PromotionStartHour').text, min_qty=min_qty,
chain.date_hour_format) discount_rate=discount_rate,
promo_update_time = datetime.strptime(promo.find(chain.promotion_update_tag_name).text, discounted_price=discounted_price,
chain.update_date_format) )
club_id = ClubID(int(promo.find(re.compile('ClubId', re.IGNORECASE)).text)) promo_start_time = datetime.strptime(
multiple_discounts_allowed = bool(int(promo.find('AllowMultipleDiscounts').text)) promo.find("PromotionStartDate").text
+ " "
+ promo.find("PromotionStartHour").text,
chain.date_hour_format,
)
promo_update_time = datetime.strptime(
promo.find(chain.promotion_update_tag_name).text, chain.update_date_format
)
club_id = ClubID(int(promo.find(re.compile("ClubId", re.IGNORECASE)).text))
multiple_discounts_allowed = bool(int(promo.find("AllowMultipleDiscounts").text))
items = chain.get_items(promo, items_dict) items = chain.get_items(promo, items_dict)
return Promotion(content=promo_description, start_date=promo_start_time, end_date=promo_end_time, return Promotion(
update_date=promo_update_time, items=items, promo_func=promo_func, content=promo_description,
club_id=club_id, promotion_id=promotion_id, max_qty=max_qty, start_date=promo_start_time,
allow_multiple_discounts=multiple_discounts_allowed, reward_type=reward_type) end_date=promo_end_time,
update_date=promo_update_time,
items=items,
promo_func=promo_func,
club_id=club_id,
promotion_id=promotion_id,
max_qty=max_qty,
allow_multiple_discounts=multiple_discounts_allowed,
reward_type=reward_type,
)
def get_discounted_price(promo): def get_discounted_price(promo):
discounted_price = promo.find('DiscountedPrice') discounted_price = promo.find("DiscountedPrice")
if discounted_price: if discounted_price:
return float(discounted_price.text) return float(discounted_price.text)
@@ -263,8 +316,14 @@ def get_discount_rate(discount_rate: Union[float, None], discount_in_percentage:
return float(discount_rate) return float(discount_rate)
def find_promo_function(reward_type: RewardType, remark: str, promo_description: str, min_qty: float, def find_promo_function(
discount_rate: Union[float, None], discounted_price: Union[float, None]): reward_type: RewardType,
remark: str,
promo_description: str,
min_qty: float,
discount_rate: Union[float, None],
discounted_price: Union[float, None],
):
if reward_type == RewardType.SECOND_INSTANCE_DIFFERENT_DISCOUNT: if reward_type == RewardType.SECOND_INSTANCE_DIFFERENT_DISCOUNT:
if not discounted_price: if not discounted_price:
return lambda item: item.price * (1 - (discount_rate / min_qty)) return lambda item: item.price * (1 - (discount_rate / min_qty))
@@ -277,7 +336,9 @@ def find_promo_function(reward_type: RewardType, remark: str, promo_description:
return lambda item: item.price * (1 - (1 / min_qty)) return lambda item: item.price * (1 - (1 / min_qty))
if reward_type == RewardType.DISCOUNT_IN_PERCENTAGE: if reward_type == RewardType.DISCOUNT_IN_PERCENTAGE:
return lambda item: item.price * (1 - discount_rate / (2 if "השני ב" in promo_description else 1)) return lambda item: item.price * (
1 - discount_rate / (2 if "השני ב" in promo_description else 1)
)
if reward_type == RewardType.SECOND_INSTANCE_SAME_DISCOUNT: if reward_type == RewardType.SECOND_INSTANCE_SAME_DISCOUNT:
if "השני ב" in promo_description: if "השני ב" in promo_description:
@@ -299,24 +360,73 @@ def find_promo_function(reward_type: RewardType, remark: str, promo_description:
return lambda item: INVALID_OR_UNKNOWN_PROMOTION_FUNCTION return lambda item: INVALID_OR_UNKNOWN_PROMOTION_FUNCTION
def main_latest_promos(store_id: int, output_filename, chain: SupermarketChain, load_promos: bool, def main_latest_promos(
load_xml: bool) -> None: store_id: int,
output_filename,
chain: SupermarketChain,
load_promos: bool,
load_prices: bool,
) -> None:
""" """
This function writes to a file the available promotions in a store with a given id sorted by their update date. This function writes to a file the available promotions in a store with a given id sorted by their update date.
:param chain: The name of the requested supermarket chain :param chain: The name of the requested supermarket chain
:param store_id: A given store id :param store_id: A given store id
:param load_xml: A boolean representing whether to load an existing prices xml file :param load_prices: A boolean representing whether to load an existing prices xml file
:param load_promos: A boolean representing whether to load an existing promos xml file :param load_promos: A boolean representing whether to load an existing promos xml file
:param output_filename: A path to write the promotions table :param output_filename: A path to write the promotions table
""" """
promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos) promotions: List[Promotion] = get_available_promos(
promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date - chain, store_id, load_prices, load_promos
promo.end_date), reverse=True) )
promotions.sort(
key=lambda promo: (
max(promo.update_date.date(), promo.start_date.date()),
promo.start_date - promo.end_date,
),
reverse=True,
)
write_promotions_to_table(promotions, output_filename) write_promotions_to_table(promotions, output_filename)
def log_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool): def get_all_prices(
store_id: int,
output_filename,
chain: SupermarketChain,
load_promos: bool,
load_prices: bool,
):
log_message_and_time_if_debug("Importing prices XML file")
items_dict: Dict[str, Item] = create_items_dict(chain, store_id, load_prices)
log_message_and_time_if_debug("Importing promotions XML file")
promo_tags = get_all_promos_tags(chain, store_id, load_promos)
log_message_and_time_if_debug("Creating promotions objects")
promo_obj = None
for promo in tqdm(promo_tags, desc="creating_promotions"):
promotion_id = int(promo.find(re.compile("PromotionId", re.IGNORECASE)).text)
if promo_obj is None or promo_obj.promotion_id != promotion_id:
promo_obj = create_new_promo_instance(
chain, items_dict, promo, promotion_id
)
for item in promo.find_all("Item"):
item_code = item.find("ItemCode").text
cur_item = items_dict.get(item_code)
if cur_item is not None:
discounted_price = promo_obj.promo_func(cur_item)
if cur_item.price > discounted_price:
cur_item.final_price = discounted_price
return items_dict
def log_promos_by_name(
store_id: int,
chain: SupermarketChain,
promo_name: str,
load_prices: bool,
load_promos: bool,
):
""" """
This function prints all promotions in a given chain and store_id containing a given promo_name. This function prints all promotions in a given chain and store_id containing a given promo_name.
@@ -326,7 +436,9 @@ def log_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str,
:param load_prices: A boolean representing whether to load an saved prices XML file or scrape a new one :param load_prices: A boolean representing whether to load an saved prices XML file or scrape a new one
:param load_promos: A boolean representing whether to load an saved XML file or scrape a new one :param load_promos: A boolean representing whether to load an saved XML file or scrape a new one
""" """
promotions: List[Promotion] = get_available_promos(chain, store_id, load_prices, load_promos) promotions: List[Promotion] = get_available_promos(
chain, store_id, load_prices, load_promos
)
for promo in promotions: for promo in promotions:
if promo_name in promo.content: if promo_name in promo.content:
logging.info(promo.repr_ltr()) logging.info(promo.repr_ltr())
@@ -339,10 +451,16 @@ def get_all_null_items_in_promos(chain, store_id) -> List[str]:
""" """
items_dict: Dict[str, Item] = create_items_dict(chain, store_id, load_xml=True) items_dict: Dict[str, Item] = create_items_dict(chain, store_id, load_xml=True)
promo_tags = get_all_promos_tags(chain, store_id, load_xml=True) promo_tags = get_all_promos_tags(chain, store_id, load_xml=True)
return [item for promo_tag in promo_tags for item in chain.get_null_items(promo_tag, items_dict)] return [
item
for promo_tag in promo_tags
for item in chain.get_null_items(promo_tag, items_dict)
]
def get_all_promos_tags(chain: SupermarketChain, store_id: int, load_xml: bool) -> List[Tag]: def get_all_promos_tags(
chain: SupermarketChain, store_id: int, load_xml: bool
) -> List[Tag]:
""" """
This function gets all the promotions tags for a given store in a given chain. This function gets all the promotions tags for a given store in a given chain.
It includes both the full and not full promotions files. It includes both the full and not full promotions files.
@@ -353,8 +471,14 @@ def get_all_promos_tags(chain: SupermarketChain, store_id: int, load_xml: bool)
:return: A list of promotions tags :return: A list of promotions tags
""" """
bs_objects = list() bs_objects = list()
for category in tqdm(XML_FILES_PROMOTIONS_CATEGORIES, desc='promotions_files'): for category in tqdm(XML_FILES_PROMOTIONS_CATEGORIES, desc="promotions_files"):
xml_path = xml_file_gen(chain, store_id, category.name) xml_path = xml_file_gen(chain, store_id, category.name)
bs_objects.append(create_bs_object(chain, store_id, category, load_xml, xml_path)) bs_objects.append(
create_bs_object(chain, store_id, category, load_xml, xml_path)
)
return [promo for bs_obj in bs_objects for promo in bs_obj.find_all(chain.promotion_tag_name)] return [
promo
for bs_obj in bs_objects
for promo in bs_obj.find_all(chain.promotion_tag_name)
]

View File

@@ -12,4 +12,6 @@ pytest~=6.2.2
pandas~=1.2.0 pandas~=1.2.0
argparse~=1.4.0 argparse~=1.4.0
XlsxWriter~=1.4.3 XlsxWriter~=1.4.3
aenum aenum
selenium
webdriver-manager

View File

@@ -1,10 +1,9 @@
import re
from abc import abstractmethod from abc import abstractmethod
from enum import Enum
from argparse import ArgumentTypeError from argparse import ArgumentTypeError
from typing import Dict, List from typing import Dict, List
import requests import requests
from aenum import StrEnum
from bs4.element import Tag from bs4.element import Tag
from item import Item from item import Item
@@ -20,18 +19,24 @@ class SupermarketChain(object, metaclass=Meta):
A class representing a supermarket chain. A class representing a supermarket chain.
""" """
class XMLFilesCategory(Enum): class XMLFilesCategory(StrEnum):
""" """
An enum class of different XML files produced by a supermarket chain An enum class of different XML files produced by a supermarket chain
""" """
All, Prices, PricesFull, Promos, PromosFull, Stores = range(6)
_promotion_tag_name = 'Promotion' All = ("All",)
_promotion_update_tag_name = 'PromotionUpdateDate' Prices = ("price",)
_date_format = '%Y-%m-%d' PricesFull = ("pricefull",)
_date_hour_format = '%Y-%m-%d %H:%M' Promos = ("promo",)
_update_date_format = '%Y-%m-%d %H:%M' PromosFull = ("promofull",)
_item_tag_name = 'Item' Stores = "store"
_promotion_tag_name = "Promotion"
_promotion_update_tag_name = "PromotionUpdateDate"
_date_format = "%Y-%m-%d"
_date_hour_format = "%Y-%m-%d %H:%M"
_update_date_format = "%Y-%m-%d %H:%M"
_item_tag_name = "Item"
@property @property
def promotion_tag_name(self): def promotion_tag_name(self):
@@ -75,14 +80,19 @@ class SupermarketChain(object, metaclass=Meta):
:return: The given store_id if valid, else raise an ArgumentTypeError. :return: The given store_id if valid, else raise an ArgumentTypeError.
""" """
if not SupermarketChain.is_valid_store_id(int(store_id)): if not SupermarketChain.is_valid_store_id(int(store_id)):
raise ArgumentTypeError(f"Given store_id: {store_id} is not a valid store_id.") raise ArgumentTypeError(
f"Given store_id: {store_id} is not a valid store_id."
)
return store_id return store_id
@staticmethod @staticmethod
@abstractmethod @abstractmethod
def get_download_url(store_id: int, category: XMLFilesCategory, session: requests.Session) -> str: def get_download_url_or_path(
store_id: int, category: XMLFilesCategory, session: requests.Session
) -> str:
""" """
This method scrapes supermarket's website and returns a url containing the data for a given store and category. This method scrapes the supermarket's website and according to the given store id and category,
it returns a url containing the data or or a path to a gz file containing the data.
:param store_id: A given ID of a store :param store_id: A given ID of a store
:param category: A given category :param category: A given category
@@ -100,8 +110,8 @@ class SupermarketChain(object, metaclass=Meta):
:param items_dict: A given dictionary of products :param items_dict: A given dictionary of products
""" """
items = list() items = list()
for item in promo.find_all('Item'): for item in promo.find_all("Item"):
item_code = item.find('ItemCode').text item_code = item.find("ItemCode").text
full_item_info = items_dict.get(item_code) full_item_info = items_dict.get(item_code)
if full_item_info: if full_item_info:
items.append(full_item_info) items.append(full_item_info)
@@ -112,14 +122,8 @@ class SupermarketChain(object, metaclass=Meta):
""" """
This function returns all the items in a given promotion which do not appear in the given items_dict. This function returns all the items in a given promotion which do not appear in the given items_dict.
""" """
return [item.find('ItemCode').text for item in promo.find_all('Item') return [
if not items_dict.get(item.find('ItemCode').text)] item.find("ItemCode").text
for item in promo.find_all("Item")
@staticmethod if not items_dict.get(item.find("ItemCode").text)
def get_item_info(item: Tag) -> Item: ]
"""
This function returns a string containing important information about a given supermarket's product.
"""
return Item(name=item.find(re.compile(r'ItemN[a]?m[e]?')).text, price=float(item.find('ItemPrice').text),
price_by_measure=float(item.find('UnitOfMeasurePrice').text), code=item.find('ItemCode').text,
manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text)

View File

@@ -1,10 +1,11 @@
import logging import logging
import os import os
import re
import tempfile
import pandas as pd
import pytest import pytest
import requests import requests
from tqdm import tqdm
import pandas as pd
import re
from chains.bareket import Bareket from chains.bareket import Bareket
from chains.co_op import CoOp from chains.co_op import CoOp
@@ -14,89 +15,90 @@ from chains.shuk_hayir import ShukHayir
from chains.stop_market import StopMarket from chains.stop_market import StopMarket
from chains.tiv_taam import TivTaam from chains.tiv_taam import TivTaam
from chains.zol_vebegadol import ZolVebegadol from chains.zol_vebegadol import ZolVebegadol
from main import CHAINS_DICT
from promotion import PROMOTION_COLS_NUM, main_latest_promos from promotion import PROMOTION_COLS_NUM, main_latest_promos
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
from chains import (
bareket,
mahsaneiHashook,
dor_alon,
freshmarket,
hazi_hinam,
keshet,
stop_market,
tiv_taam,
shufersal,
co_op,
victory,
yohananof,
zol_vebegadol,
rami_levi,
osher_ad,
maayan2000,
shuk_hayir,
king_store,
shefa_birkat_hashem,
)
pytest.main(args=['-s', os.path.abspath(__file__)]) pytest.main(args=["-s", os.path.abspath(__file__)])
chain_dict = {repr(chain): chain() if callable(chain) else None for chain in SupermarketChain.__subclasses__()} session = requests.Session()
MIN_NUM_OF_PROMOS = 3 MIN_NUM_OF_PROMOS = 3
def test_searching_for_download_urls(): @pytest.mark.parametrize("chain_tuple", CHAINS_DICT.items())
def test_searching_for_download_urls(chain_tuple):
""" """
Test that get_download_url of each chain returns the correct download url for each category in every chain. Test that get_download_url of each chain returns the correct download url for each category in every chain.
""" """
session = requests.Session() chain_name, chain = chain_tuple
for chain_name, chain in tqdm(chain_dict.items(), desc='chains'): # for chain_name, chain in tqdm(chain_dict.items(), desc='chains'):
logging.info(f'Checking download urls in chain {chain_name}') logging.info(f"Checking download urls in chain {chain_name}")
store_id: int = valid_store_id_by_chain(chain_name) store_id: int = valid_store_id_by_chain(chain_name)
_test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PromosFull, r'promo[s]?full', session) _test_download_url_helper(
_test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Promos, r'promo[s]?', session) chain, store_id, chain.XMLFilesCategory.PromosFull, r"promo[s]?full", session
_test_download_url_helper(chain, store_id, chain.XMLFilesCategory.PricesFull, r'price[s]?full', session) )
_test_download_url_helper(chain, store_id, chain.XMLFilesCategory.Prices, r'price[s]?', session) _test_download_url_helper(
chain, store_id, chain.XMLFilesCategory.Promos, r"promo[s]?", session
)
_test_download_url_helper(
chain, store_id, chain.XMLFilesCategory.PricesFull, r"price[s]?full", session
)
_test_download_url_helper(
chain, store_id, chain.XMLFilesCategory.Prices, r"price[s]?", session
)
def _test_download_url_helper(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory, def _test_download_url_helper(
regex_pat: str, session: requests.session): chain: SupermarketChain,
download_url: str = chain.get_download_url(store_id, category, session) store_id: int,
category: SupermarketChain.XMLFilesCategory,
regex_pat: str,
session: requests.session,
):
download_url: str = chain.get_download_url_or_path(store_id, category, session)
if not download_url: # Not found non-full Promos/Prices file if not download_url: # Not found non-full Promos/Prices file
return return
logging.debug(download_url) logging.debug(download_url)
assert re.search(regex_pat, download_url, re.IGNORECASE), f'Invalid {category.name} url in {repr(type(chain))}' assert re.search(
regex_pat, download_url, re.IGNORECASE
), f"Invalid {category.name} url in {repr(type(chain))}"
if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]: if category in [chain.XMLFilesCategory.Prices, chain.XMLFilesCategory.Promos]:
assert not re.search('full', download_url, re.IGNORECASE), \ assert not re.search(
f'Downloaded the full {category.name} file mistakenly in {repr(type(chain))}' "full", download_url, re.IGNORECASE
), f"Downloaded the full {category.name} file mistakenly in {repr(type(chain))}"
def test_promotions_scraping(): @pytest.mark.parametrize("chain_tuple", CHAINS_DICT.items())
def test_promotions_scraping(chain_tuple):
""" """
Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output. Test scraping of promotions is completed successfully and a valid xlsx file is generated as an output.
""" """
filename = 'temp.xlsx' chain_name, chain = chain_tuple
for chain_name, chain in tqdm(chain_dict.items(), desc='chains'): tf = tempfile.NamedTemporaryFile(suffix=".xlsx")
logging.info(f'Test scraping promotions from {chain_name}')
store_id: int = valid_store_id_by_chain(chain_name) logging.info(f"Test scraping promotions from {chain_name}")
try:
main_latest_promos(
store_id=store_id,
output_filename=filename,
chain=chain,
load_promos=False,
load_xml=False,
)
df = pd.read_excel(filename)
except Exception as e:
logging.error(e)
logging.error(f"Failed loading excel of {chain_name}")
raise
assert df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM, f"Failed scraping {chain_name}" store_id: int = valid_store_id_by_chain(chain_name)
try:
main_latest_promos(
store_id=store_id,
output_filename=tf.name,
chain=chain,
load_promos=False,
load_prices=False,
)
df = pd.read_excel(tf.name)
except Exception as e:
logging.error(e)
logging.error(f"Failed loading excel of {chain_name}")
raise
assert (
df.shape[0] > MIN_NUM_OF_PROMOS and df.shape[1] == PROMOTION_COLS_NUM
), f"Failed scraping {chain_name}"
def valid_store_id_by_chain(chain_name) -> int: def valid_store_id_by_chain(chain_name) -> int:
@@ -108,11 +110,11 @@ def valid_store_id_by_chain(chain_name) -> int:
""" """
if chain_name == repr(DorAlon): if chain_name == repr(DorAlon):
store_id = 501 store_id = 501
elif chain_name in [repr(TivTaam), repr(Bareket), repr(ZolVebegadol)]: elif chain_name in [repr(TivTaam), repr(Bareket)]:
store_id = 2 store_id = 2
elif chain_name == repr(CoOp): elif chain_name == repr(CoOp):
store_id = 202 store_id = 202
elif chain_name == repr(ShukHayir): elif chain_name == [repr(ShukHayir), repr(ZolVebegadol)]:
store_id = 4 store_id = 4
elif chain_name in [repr(StopMarket), repr(Keshet)]: elif chain_name in [repr(StopMarket), repr(Keshet)]:
store_id = 5 store_id = 5

105
utils.py
View File

@@ -1,14 +1,16 @@
import gzip import gzip
import io import io
import logging import logging
import os.path
import zipfile import zipfile
from argparse import ArgumentTypeError from argparse import ArgumentTypeError
from datetime import date
from datetime import datetime from datetime import datetime
from os import path
from typing import AnyStr, Dict from typing import AnyStr, Dict
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from os import path
from tqdm import tqdm from tqdm import tqdm
from item import Item from item import Item
@@ -29,12 +31,22 @@ def xml_file_gen(chain: SupermarketChain, store_id: int, category_name: str) ->
:param category_name: A given category name :param category_name: A given category name
:return: An xml filename :return: An xml filename
""" """
store_id_str: str = f"-{str(store_id)}" if SupermarketChain.is_valid_store_id(store_id) else "" store_id_str: str = (
return path.join(RAW_FILES_DIRNAME, f"{repr(type(chain))}-{category_name}{store_id_str}.xml") f"-{str(store_id)}" if SupermarketChain.is_valid_store_id(store_id) else ""
)
return path.join(
RAW_FILES_DIRNAME,
f"{repr(type(chain))}-{category_name}{store_id_str}-{date.today()}.xml",
)
def create_bs_object(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory, def create_bs_object(
load_xml: bool, xml_path: str) -> BeautifulSoup: chain: SupermarketChain,
store_id: int,
category: SupermarketChain.XMLFilesCategory,
load_xml: bool,
xml_path: str,
) -> BeautifulSoup:
""" """
This function creates a BeautifulSoup (BS) object according to the given parameters. This function creates a BeautifulSoup (BS) object according to the given parameters.
In case the given load_xml is True and the XML file exists, the function creates the BS object from the given In case the given load_xml is True and the XML file exists, the function creates the BS object from the given
@@ -53,8 +65,12 @@ def create_bs_object(chain: SupermarketChain, store_id: int, category: Supermark
return get_bs_object_from_link(chain, store_id, category, xml_path) return get_bs_object_from_link(chain, store_id, category, xml_path)
def get_bs_object_from_link(chain: SupermarketChain, store_id: int, category: SupermarketChain.XMLFilesCategory, def get_bs_object_from_link(
xml_path: str) -> BeautifulSoup: chain: SupermarketChain,
store_id: int,
category: SupermarketChain.XMLFilesCategory,
xml_path: str,
) -> BeautifulSoup:
""" """
This function creates a BeautifulSoup (BS) object by generating a download link the given chain's API. This function creates a BeautifulSoup (BS) object by generating a download link the given chain's API.
@@ -65,20 +81,25 @@ def get_bs_object_from_link(chain: SupermarketChain, store_id: int, category: Su
:return: A BeautifulSoup object with xml content. :return: A BeautifulSoup object with xml content.
""" """
session = requests.Session() session = requests.Session()
download_url: str = chain.get_download_url(store_id, category, session) download_url_or_path: str = chain.get_download_url_or_path(store_id, category, session)
if not download_url: if not download_url_or_path:
return BeautifulSoup() return BeautifulSoup()
response_content = session.get(download_url).content if os.path.isfile(download_url_or_path):
try: with gzip.open(download_url_or_path) as fIn:
xml_content: AnyStr = gzip.decompress(response_content) xml_content = fIn.read()
except gzip.BadGzipFile: os.remove(download_url_or_path) # Delete gz file
with zipfile.ZipFile(io.BytesIO(response_content)) as the_zip: else:
zip_info = the_zip.infolist()[0] response_content = session.get(download_url_or_path).content
with the_zip.open(zip_info) as the_file: try:
xml_content = the_file.read() xml_content: AnyStr = gzip.decompress(response_content)
with open(xml_path, 'wb') as f_out: except gzip.BadGzipFile:
with zipfile.ZipFile(io.BytesIO(response_content)) as the_zip:
zip_info = the_zip.infolist()[0]
with the_zip.open(zip_info) as the_file:
xml_content = the_file.read()
with open(xml_path, "wb") as f_out:
f_out.write(xml_content) f_out.write(xml_content)
return BeautifulSoup(xml_content, features='xml') return BeautifulSoup(xml_content, features="xml")
def get_bs_object_from_xml(xml_path: str) -> BeautifulSoup: def get_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
@@ -88,11 +109,13 @@ def get_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
:param xml_path: A given path to an xml file to load/save the BS object from/to. :param xml_path: A given path to an xml file to load/save the BS object from/to.
:return: A BeautifulSoup object with xml content. :return: A BeautifulSoup object with xml content.
""" """
with open(xml_path, 'rb') as f_in: with open(xml_path, "rb") as f_in:
return BeautifulSoup(f_in, features='xml') return BeautifulSoup(f_in, features="xml")
def create_items_dict(chain: SupermarketChain, store_id: int, load_xml) -> Dict[str, Item]: def create_items_dict(
chain: SupermarketChain, store_id: int, load_xml
) -> Dict[str, Item]:
""" """
This function creates a dictionary where every key is an item code and its value is its corresponding Item instance. This function creates a dictionary where every key is an item code and its value is its corresponding Item instance.
We take both full and not full prices files, and assume that the no full is more updated (in case of overwriting). We take both full and not full prices files, and assume that the no full is more updated (in case of overwriting).
@@ -102,16 +125,28 @@ def create_items_dict(chain: SupermarketChain, store_id: int, load_xml) -> Dict[
:param store_id: A given store id :param store_id: A given store id
""" """
items_dict = dict() items_dict = dict()
for category in tqdm([chain.XMLFilesCategory.PricesFull, chain.XMLFilesCategory.Prices], desc='prices_files'): for category in tqdm(
[chain.XMLFilesCategory.PricesFull, chain.XMLFilesCategory.Prices],
desc="prices_files",
):
xml_path: str = xml_file_gen(chain, store_id, category.name) xml_path: str = xml_file_gen(chain, store_id, category.name)
bs_prices: BeautifulSoup = create_bs_object(chain, store_id, category, load_xml, xml_path) bs_prices: BeautifulSoup = create_bs_object(
chain, store_id, category, load_xml, xml_path
)
items_tags = bs_prices.find_all(chain.item_tag_name) items_tags = bs_prices.find_all(chain.item_tag_name)
items_dict.update({item.find('ItemCode').text: chain.get_item_info(item) for item in items_tags}) items_dict.update(
{
item_tag.find("ItemCode").text: Item.from_tag(item_tag)
for item_tag in items_tags
}
)
return items_dict return items_dict
def log_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None: def log_products_prices(
chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str
) -> None:
""" """
This function prints the products in a given store which contains a given product_name. This function prints the products in a given store which contains a given product_name.
@@ -121,8 +156,12 @@ def log_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool,
:param load_xml: A boolean representing whether to load an existing xml or load an already saved one :param load_xml: A boolean representing whether to load an existing xml or load an already saved one
""" """
items_dict: Dict[str, Item] = create_items_dict(chain, store_id, load_xml) items_dict: Dict[str, Item] = create_items_dict(chain, store_id, load_xml)
products_by_name = [item for item in items_dict.values() if product_name in item.name] products_by_name = [
products_by_name_sorted_by_price = sorted(products_by_name, key=lambda item: item.price_by_measure) item for item in items_dict.values() if product_name in item.name
]
products_by_name_sorted_by_price = sorted(
products_by_name, key=lambda item: item.price_by_measure
)
for prod in products_by_name_sorted_by_price: for prod in products_by_name_sorted_by_price:
logging.info(prod) logging.info(prod)
@@ -134,12 +173,16 @@ def get_float_from_tag(tag, int_tag) -> int:
def is_valid_promotion_output_file(output_file: str) -> bool: def is_valid_promotion_output_file(output_file: str) -> bool:
return any(output_file.endswith(extension) for extension in VALID_PROMOTION_FILE_EXTENSIONS) return any(
output_file.endswith(extension) for extension in VALID_PROMOTION_FILE_EXTENSIONS
)
def valid_promotion_output_file(output_file: str) -> str: def valid_promotion_output_file(output_file: str) -> str:
if not is_valid_promotion_output_file(output_file): if not is_valid_promotion_output_file(output_file):
raise ArgumentTypeError(f"Given output file is not a natural number:\n{output_file}") raise ArgumentTypeError(
f"Given output file has an invalid extension is invalid: {output_file}"
)
return output_file return output_file