has added the Co-Op chain. Extensibility to other chains has significantly improved as well.

This commit is contained in:
KorenLazar
2020-12-17 22:48:10 +02:00
parent 4c0eba1771
commit be47a5ad81
10 changed files with 320 additions and 139 deletions

1
.gitignore vendored
View File

@@ -8,3 +8,4 @@ stores_*
venv/ venv/
__pycache__/ __pycache__/
xmls/ xmls/
logs/

View File

@@ -1,10 +1,10 @@
# Shufersal basic scraping # Supermarket basic scraping
## Installation ## Installation
clone: clone:
```cmd script ```cmd script
git clone https://github.com/korenLazar/shufersal-scraping.git git clone https://github.com/korenLazar/supermarket-scraping.git
cd shufersal-scraping cd supermarket-scraping
virtualenv venv virtualenv venv
venv\bin\activate venv\bin\activate
pip install -r requirements.txt pip install -r requirements.txt
@@ -16,17 +16,17 @@ pip install -r requirements.txt
2. virtualenv 2. virtualenv
## Usage ## Usage
First, to find your store's id, you can run the following command (assuming you live in Jerusalem): First, to find your Shufersal store's ID, you can run the following command (assuming you live in Jerusalem):
```cmd script ```cmd script
python main.py --find_store ירושלים python main.py --find_store ירושלים --chain Shufersal
``` ```
After running the command, you'll be able to see the different stores in Jerusalem with their ids in "stores_ירושלים.log". After running the command, you'll be able to see the different stores in Jerusalem with their IDs on the screen.
Now, that we have the store's id, we can get its promotions sorted by their update date by running Now, that we have the store's ID, we can get its promotions sorted by their update date by running
```cmd script ```cmd script
python main.py --promos 5 python main.py --promos 5 --chain Shufersal
``` ```
* We assumed that the store's id is 5. * We assumed that the store's ID is 5.
Now, you can find the promos in "promos_5.log". Now, you can find the promos in "promos_5.log".
For other documentation and commands, you can run For other documentation and commands, you can run

36
co_op.py Normal file
View File

@@ -0,0 +1,36 @@
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from supermarket_chain import SupermarketChain
class CoOp(SupermarketChain):
    """
    The Co-Op supermarket chain.

    Download links for its XML files are scraped from the listing page at matrixcatalog.co.il.
    """

    # Chain-specific XML tag names and date formats.
    promotion_tag_name = 'Sale'
    promotion_update_tag_name = 'PriceUpdateDate'
    date_format = '%Y/%m/%d'
    date_hour_format = '%Y/%m/%d %H:%M:%S'
    item_tag_name = 'Product'

    @staticmethod
    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
        """
        This method scrapes the chain's listing page and returns the download URL of the first link matching
        the given store and category. The resulting URL is also printed.

        :param store_id: A given id of a store
        :param category: A given category; only its name is used for matching
        :return: A downloadable link of the data for a given store and category
        :raises ValueError: If the listing page contains no link matching the store and category
        """
        prefix = "http://matrixcatalog.co.il/"
        url = prefix + "NBCompetitionRegulations.aspx"
        req_res: requests.Response = requests.get(url)
        soup = BeautifulSoup(req_res.text, features='lxml')

        # Computed once instead of once per anchor: the category name with every lowercase 's' removed
        # (e.g. 'PromosFull' -> 'PromoFull') is the substring searched for in each href.
        category_marker: str = category.name.replace('s', '')
        # NOTE(review): '-20' presumably matches the beginning of a year in the file name - confirm.
        store_marker: str = f'-{store_id}-20'

        def is_requested_link(href) -> bool:
            # href is None for anchors without that attribute, hence the truthiness check.
            return bool(href) and category_marker in href and store_marker in href

        link = soup.find('a', href=is_requested_link)
        if link is None:
            # find() returns None when nothing matches; fail with an explicit message rather than
            # an AttributeError on None.
            raise ValueError(f"No download link was found at {url} for category {category.name} "
                             f"and store_id {store_id}.")
        suffix: str = link.attrs['href']
        down_url = prefix + suffix
        print(down_url)
        return down_url

    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
        """
        The categories of XML files of this chain; get_download_url matches links by the member's name.
        """
        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)

    def __repr__(self):
        return 'Co-Op'

    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
        """
        This method returns a list of the items that participate in a given promo.
        Only the first ItemCode tag found in the promo is looked up.

        :param promo: A given promo
        :param items_dict: A given dictionary of products, keyed by item code
        :return: A list holding the matching product, or an empty list if it is missing (or falsy) in items_dict
        """
        promo_item = items_dict.get(promo.find('ItemCode').text)
        return [promo_item] if promo_item else []

53
main.py
View File

@@ -1,11 +1,22 @@
from argparse import ArgumentParser from argparse import ArgumentParser
import logging import logging
from promotion import main_latest_promos, get_promos_by_name from promotion import main_latest_promos, get_promos_by_name
from store import get_store_id, store_id_type from store_utils import get_store_id
from utils import get_products_prices from utils import LOGS_DIRNAME, XMLS_DIRNAME, get_products_prices
from supermarket_chain import SupermarketChain
from shufersal import ShuferSal
from co_op import CoOp
from pathlib import Path
# TODO: improve extendability: support addition of different supermarket chains # TODO: fix problem of left-to-right printing
# TODO: fix problem of left-to-right printing in Windows' cmd
Path(LOGS_DIRNAME).mkdir(exist_ok=True)
Path(XMLS_DIRNAME).mkdir(exist_ok=True)
chain_dict = {
'Shufersal': ShuferSal(),
'Co-Op': CoOp(),
}
if __name__ == '__main__': if __name__ == '__main__':
parser = ArgumentParser() parser = ArgumentParser()
@@ -13,7 +24,7 @@ if __name__ == '__main__':
help="generates a promos_{store_id}.log file with all the promotions in the requested store", help="generates a promos_{store_id}.log file with all the promotions in the requested store",
metavar='store_id', metavar='store_id',
nargs=1, nargs=1,
type=store_id_type, type=SupermarketChain.store_id_type,
) )
parser.add_argument('--find_promos_by_name', parser.add_argument('--find_promos_by_name',
help="prints all promos containing the given promo_name in the given store", help="prints all promos containing the given promo_name in the given store",
@@ -31,30 +42,44 @@ if __name__ == '__main__':
metavar='city', metavar='city',
nargs=1, nargs=1,
) )
parser.add_argument('--load_xml', parser.add_argument('--load_prices',
help='boolean flag representing whether to load an existing xml', help='boolean flag representing whether to load an existing price XML file',
action='store_true', action='store_true',
) )
parser.add_argument('--load_promos',
help='boolean flag representing whether to load an existing promo XML file',
action='store_true',
)
parser.add_argument('--load_stores',
help='boolean flag representing whether to load an existing stores XML file',
action='store_true',
)
parser.add_argument('--chain',
required=True,
help='The name of the requested chain',
choices=['Shufersal', 'Co-Op'],
)
args = parser.parse_args() args = parser.parse_args()
chain: SupermarketChain = chain_dict[args.chain]
if args.promos: if args.promos:
arg_store_id = int(args.promos[0]) arg_store_id = int(args.promos[0])
logger = logging.getLogger() logger = logging.getLogger()
logger.setLevel(logging.INFO) logger.setLevel(logging.INFO)
handler = logging.FileHandler(filename=f'promos_{arg_store_id}.log', mode='w', encoding='utf-8') handler = logging.FileHandler(filename=f'logs/{args.chain}_promos_{arg_store_id}.log', mode='w',
encoding='utf-8')
logger.addHandler(handler) logger.addHandler(handler)
main_latest_promos(store_id=arg_store_id, main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain)
load_xml=args.load_xml,
logger=logger)
elif args.price: elif args.price:
get_products_prices(store_id=args.price[0], product_name=args.price[1], load_xml=args.load_xml) get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1])
elif args.find_store_id: elif args.find_store_id:
arg_city = args.find_store_id[0] arg_city = args.find_store_id[0]
get_store_id(city=arg_city, load_xml=args.load_xml) get_store_id(city=arg_city, load_xml=args.load_stores, chain=chain)
elif args.find_promos_by_name: elif args.find_promos_by_name:
arg_store_id = int(args.find_promos_by_name[0]) arg_store_id = int(args.find_promos_by_name[0])
get_promos_by_name(store_id=arg_store_id, load_xml=args.load_xml, promo_name=args.find_promos_by_name[1]) get_promos_by_name(store_id=arg_store_id, chain=chain, promo_name=args.find_promos_by_name[1],
load_prices=args.load_prices, load_promos=args.load_promos)

View File

@@ -1,11 +1,13 @@
from datetime import datetime from datetime import datetime
from typing import List from typing import Dict, List
from bs4 import BeautifulSoup
from utils import ( from utils import (
ShufersalCategories,
create_items_dict, create_items_dict,
xml_file_gen, xml_file_gen,
create_bs_object, create_bs_object,
) )
from supermarket_chain import SupermarketChain
PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל'] PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל']
@@ -23,66 +25,93 @@ class Promotion:
self.update_date: datetime = update_date self.update_date: datetime = update_date
self.items: List[str] = items self.items: List[str] = items
def __str__(self): def __repr__(self):
title = self.content title = self.content
dates_range = f"Between {self.start_date.date()} and {self.end_date.date()}" dates_range = f"Between {self.start_date} and {self.end_date}"
update_line = f"Updated at {self.update_date.date()}" update_line = f"Updated at {self.update_date}"
items = '\n'.join(str(item) for item in self.items) items = '\n'.join(str(item) for item in self.items)
return '\n'.join([title, dates_range, update_line, items]) + '\n' return '\n'.join([title, dates_range, update_line, items]) + '\n'
def repr_ltr(self):
title = self.content[::-1]
dates_range = f"Between {self.start_date} and {self.end_date}"
update_line = f"Updated at {self.update_date}"
items = '\n'.join(str(item)[::-1] for item in self.items)
return '\n'.join([title, dates_range, update_line, items]) + '\n'
def get_available_promos(store_id: int, load_xml: bool) -> List[Promotion]:
def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]:
""" """
This function return the available promotions given a BeautifulSoup object. This function return the available promotions given a BeautifulSoup object.
:param load_promos:
:param chain: The name of the requested supermarket chain
:param store_id: A given store id :param store_id: A given store id
:param load_xml: A boolean representing whether to load an existing xml or load an already saved one :param load_prices: A boolean representing whether to load an existing xml or load an already saved one
:return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available
""" """
items_dict = create_items_dict(store_id, load_xml) items_dict: Dict[str, str] = create_items_dict(chain, load_prices, store_id)
xml_path = xml_file_gen(ShufersalCategories.PromosFull.name, store_id) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
bs_promos = create_bs_object(xml_path, ShufersalCategories.PromosFull.value, store_id, False) bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, chain.XMLFilesCategory.PromosFull)
promo_objs = list() promo_objs = list()
for cur_promo in bs_promos.find_all("Promotion"): for promo in bs_promos.find_all(chain.promotion_tag_name):
cur_promo = Promotion( promo = Promotion(
content=cur_promo.find('PromotionDescription').text, content=promo.find('PromotionDescription').text,
start_date=datetime.strptime(cur_promo.find('PromotionStartDate').text, '%Y-%m-%d'), start_date=datetime.strptime(
end_date=datetime.strptime(cur_promo.find('PromotionEndDate').text, '%Y-%m-%d'), promo.find('PromotionStartDate').text + ' ' + promo.find('PromotionStartHour').text,
update_date=datetime.strptime(cur_promo.find('PromotionUpdateDate').text, '%Y-%m-%d %H:%M'), chain.date_hour_format),
items=[items_dict.get(item.find('ItemCode').text) for item in cur_promo.find_all('Item') end_date=datetime.strptime(promo.find(
if items_dict.get(item.find('ItemCode').text)], 'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format),
update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format),
items=chain.get_items(promo, items_dict),
) )
if is_valid_promo(cur_promo): if is_valid_promo(promo):
promo_objs.append(cur_promo) promo_objs.append(promo)
return promo_objs return promo_objs
def is_valid_promo(promo: Promotion): def is_valid_promo(promo: Promotion):
today_date = datetime.now() """
not_expired = promo.end_date.date() >= today_date.date() This function checks if a given promo object is valid.
has_started = promo.start_date <= today_date
has_products = len(promo.items) > 0 :param promo: A given promotion
in_promo_ignore_list = any(product in promo.content for product in PRODUCTS_TO_IGNORE) :return: True iff the given Promotion is valid.
"""
today_date: datetime = datetime.now()
not_expired: bool = promo.end_date >= today_date
has_started: bool = promo.start_date <= today_date
has_products: bool = len(promo.items) > 0
in_promo_ignore_list: bool = any(product in promo.content for product in PRODUCTS_TO_IGNORE)
return not_expired and has_started and has_products and not in_promo_ignore_list return not_expired and has_started and has_products and not in_promo_ignore_list
def main_latest_promos(store_id: int, load_xml: bool, logger): def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
""" """
This function logs the available promos in a store with a given id sorted by their update date. This function logs the available promos in a store with a given id sorted by their update date.
:param chain: The name of the requested supermarket chain
:param store_id: A given store id :param store_id: A given store id
:param load_xml: A boolean representing whether to load an existing prices xml file :param load_xml: A boolean representing whether to load an existing prices xml file
:param logger: A given logger :param logger: A given logger
""" """
promotions = get_available_promos(store_id, load_xml) promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False)
promotions.sort(key=lambda promo: max(promo.update_date, promo.start_date), reverse=True) promotions.sort(key=lambda promo: max(promo.update_date, promo.start_date), reverse=True)
logger.info('\n'.join(str(promotion) for promotion in promotions)) logger.info('\n'.join(str(promotion) for promotion in promotions))
def get_promos_by_name(store_id: int, load_xml: bool, promo_name: str): def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
promotions = get_available_promos(store_id, load_xml) """
This function prints all promotions in a given chain and store_id containing a given promo_name.
:param store_id: A given store ID
:param chain: A given supermarket chain
:param promo_name: A given name of a promo (or part of it)
:param load_prices: A boolean representing whether to load an saved prices XML file or scrape a new one
:param load_promos: A boolean representing whether to load an saved XML file or scrape a new one
"""
promotions: List[Promotion] = get_available_promos(chain, store_id, load_prices, load_promos)
for promo in promotions: for promo in promotions:
if promo_name in promo.content: if promo_name in promo.content:
print(str(promo)) print(promo.repr_ltr())

33
shufersal.py Normal file
View File

@@ -0,0 +1,33 @@
from typing import Dict, List
import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from supermarket_chain import SupermarketChain
class ShuferSal(SupermarketChain):
    """
    The Shufersal supermarket chain.

    Download links for its XML files are scraped from prices.shufersal.co.il.
    """

    # Chain-specific XML tag names and date formats.
    promotion_tag_name = 'Promotion'
    promotion_update_tag_name = 'PromotionUpdateDate'
    date_format = '%Y-%m-%d'
    date_hour_format = '%Y-%m-%d %H:%M'
    item_tag_name = 'Item'

    @staticmethod
    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
        """
        This method scrapes Shufersal's website and returns a url containing the data for a given store and category.
        The store ID is added to the query only if it is valid.

        :param store_id: A given id of a store
        :param category: A given category; its value is sent as the catID query parameter
        :return: A downloadable link of the data for a given store and category
        :raises ValueError: If the returned page contains no download link
        """
        url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}"
        if SupermarketChain.is_valid_store_id(store_id):
            url += f"&storeId={store_id}"
        req_res: requests.Response = requests.get(url)
        soup = BeautifulSoup(req_res.text, features='lxml')
        download_link = soup.find('a', text="לחץ להורדה")
        if download_link is None:
            # find() returns None when nothing matches; fail with an explicit message rather than
            # a TypeError on subscripting None.
            raise ValueError(f"No download link was found at {url}.")
        return download_link['href']

    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
        """
        The categories of XML files of this chain; a member's value is used as the catID in get_download_url.
        """
        All, Prices, PricesFull, Promos, PromosFull, Stores = range(6)

    def __repr__(self):
        return 'Shufersal'

    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
        """
        This method returns a list of the items that participate in a given promo.
        Items whose code is missing from items_dict (or maps to a falsy value) are skipped.

        :param promo: A given promo
        :param items_dict: A given dictionary of products, keyed by item code
        """
        items = []
        for item in promo.find_all('Item'):
            # A single lookup per item instead of one for the filter and another for the value.
            item_info = items_dict.get(item.find('ItemCode').text)
            if item_info:
                items.append(item_info)
        return items

View File

@@ -1,30 +0,0 @@
from argparse import ArgumentTypeError
from utils import (
ShufersalCategories,
is_valid_store_id,
xml_file_gen,
create_bs_object,
)
def store_id_type(store_id: str):
    """
    Type-verification helper for store IDs.

    :param store_id: A given store ID, as a string
    :return: The given store_id unchanged if its integer value is a valid store ID
    :raises ArgumentTypeError: If the integer value of store_id is not a valid store ID
    """
    if is_valid_store_id(int(store_id)):
        return store_id
    raise ArgumentTypeError(f"Given store_id: {store_id} is not a valid store_id.")
def get_store_id(city: str, load_xml: bool):
    """
    Print, for every store in the stores XML whose CITY field equals the given city, a tuple of its
    reversed address, its store ID and its reversed sub-chain name.

    :param city: A string representing the city of the requested store; it is compared for exact equality.
    :param load_xml: A boolean passed to create_bs_object, representing whether to try loading a saved XML file
    """
    stores_category = ShufersalCategories.Stores
    # -1 is passed as the store ID since the stores file is not requested for one specific store.
    xml_path = xml_file_gen(stores_category.name, -1)
    bs_stores = create_bs_object(xml_path, stores_category.value, -1, load_xml)
    stores_in_city = (store for store in bs_stores.find_all("STORE") if store.find("CITY").text == city)
    for store in stores_in_city:
        address = store.find("ADDRESS").text[::-1]
        store_id = store.find("STOREID").text
        sub_chain_name = store.find("SUBCHAINNAME").text[::-1]
        print((address, store_id, sub_chain_name))

20
store_utils.py Normal file
View File

@@ -0,0 +1,20 @@
from utils import xml_file_gen, create_bs_object
from supermarket_chain import SupermarketChain
from bs4 import BeautifulSoup
def get_store_id(city: str, load_xml: bool, chain: SupermarketChain):
    """
    Print, for every store in the chain's stores XML whose CITY field equals the given city, a tuple of its
    address, its store ID and its sub-chain name.

    :param city: A string representing the city of the requested store; it is compared for exact equality.
    :param load_xml: A boolean passed to create_bs_object, representing whether to try loading a saved XML file
    :param chain: A given supermarket chain
    """
    # -1 is passed as the store ID since the stores file is not requested for one specific store.
    xml_path: str = xml_file_gen(chain, -1, chain.XMLFilesCategory.Stores.name)
    bs_stores: BeautifulSoup = create_bs_object(xml_path, chain, -1, load_xml, chain.XMLFilesCategory.Stores)
    stores_in_city = (store for store in bs_stores.find_all("STORE") if store.find("CITY").text == city)
    for store in stores_in_city:
        address = store.find("ADDRESS").text
        store_id = store.find("STOREID").text
        sub_chain_name = store.find("SUBCHAINNAME").text
        print((address, store_id, sub_chain_name))

80
supermarket_chain.py Normal file
View File

@@ -0,0 +1,80 @@
from abc import abstractmethod
from enum import Enum
from argparse import ArgumentTypeError
from typing import Dict, List
from bs4.element import Tag
class SupermarketChain:
    """
    Base class describing a supermarket chain: the XML tag names and date formats it uses, and how to
    obtain its XML files and the items of a promotion.

    NOTE(review): the class does not use ABCMeta, so the @abstractmethod markers below are not enforced
    at instantiation time.
    """

    class XMLFilesCategory(Enum):
        """
        Enumeration of the kinds of XML files published by a chain; it has no members here.
        """
        pass

    @property
    @abstractmethod
    def promotion_tag_name(self):
        """The chain's promotion tag name."""

    @property
    @abstractmethod
    def promotion_update_tag_name(self):
        """The chain's promotion-update tag name."""

    @property
    @abstractmethod
    def date_format(self):
        """The chain's date format."""

    @property
    @abstractmethod
    def date_hour_format(self):
        """The chain's date-and-hour format."""

    @property
    @abstractmethod
    def item_tag_name(self):
        """The chain's item tag name."""

    @staticmethod
    def is_valid_store_id(store_id: int) -> bool:
        """
        Tell whether a given store ID is valid, i.e. a non-negative integer.

        :param store_id: A given store ID
        :return: True iff store_id is an int which is greater than or equal to zero
        """
        if not isinstance(store_id, int):
            return False
        return store_id >= 0

    @staticmethod
    def store_id_type(store_id: str) -> str:
        """
        Type-verification helper for store IDs.

        :param store_id: A given store ID, as a string
        :return: The given store_id unchanged if its integer value is a valid store ID
        :raises ArgumentTypeError: If the integer value of store_id is not a valid store ID
        """
        if SupermarketChain.is_valid_store_id(int(store_id)):
            return store_id
        raise ArgumentTypeError(f"Given store_id: {store_id} is not a valid store_id.")

    @staticmethod
    @abstractmethod
    def get_download_url(store_id: int, category: XMLFilesCategory) -> str:
        """
        Scrape the supermarket's website for a url containing the data of a given store and category.
        This base implementation does nothing and returns None.

        :param store_id: A given id of a store
        :param category: A given category
        :return: A downloadable link of the data for a given store and category
        """

    @staticmethod
    @abstractmethod
    def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
        """
        Return the items that participate in a given promo.
        This base implementation does nothing and returns None.

        :param promo: A given promo
        :param items_dict: A given dictionary of products
        """

View File

@@ -1,78 +1,63 @@
import gzip import gzip
from enum import Enum from typing import AnyStr, Dict
from typing import Dict
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from pathlib import Path from bs4.element import Tag
from os import path from os import path
from supermarket_chain import SupermarketChain
import re
LOGS_DIRNAME = "logs"
XMLS_DIRNAME = "xmls" XMLS_DIRNAME = "xmls"
Path(XMLS_DIRNAME).mkdir(exist_ok=True)
class ShufersalCategories(Enum): def xml_file_gen(chain: SupermarketChain, store_id: int, category_name: str) -> str:
All, Prices, PricesFull, Promos, PromosFull, Stores = range(6)
def xml_file_gen(category_name: str, store_id: int) -> str:
""" """
This function generate an xml filename given a store id and a category_name This function generate an XML filename given a store id and a category_name
If the given store_id is invalid, it is ignored in the returned string. If the given store_id is invalid, it is ignored in the returned string.
:param chain: A given supermarket chain
:param store_id: A given store_id :param store_id: A given store_id
:param category_name: A given category name :param category_name: A given category name
:return: An xml filename :return: An xml filename
""" """
store_id_str = f"-{str(store_id)}" if is_valid_store_id(store_id) else "" store_id_str: str = f"-{str(store_id)}" if SupermarketChain.is_valid_store_id(store_id) else ""
return path.join(XMLS_DIRNAME, f"{category_name}{store_id_str}.xml") return path.join(XMLS_DIRNAME, f"{chain}-{category_name}{store_id_str}.xml")
def get_download_url(store_id: int, cat_id: int) -> str: def create_bs_object(xml_path: str, chain: SupermarketChain, store_id: int, load_xml: bool,
""" category: SupermarketChain.XMLFilesCategory) -> BeautifulSoup:
This function scrapes Shufersal's website and returns a url that contains the data for a given store and category.
For info about the categories, see ShufersalCategories.
:param store_id: A given id of a store
:param cat_id: A given id of a category
:return: A downloadable link of the data for a given store and category
"""
url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={cat_id}"
if is_valid_store_id(store_id):
url += f"&storeId={store_id}"
req_res = requests.get(url)
soup = BeautifulSoup(req_res.text, features='lxml')
return soup.find('a', text="לחץ להורדה")['href']
def create_bs_object(xml_path: str, cat_id: int, store_id: int, load_xml: bool) -> BeautifulSoup:
""" """
This function creates a BeautifulSoup (BS) object according to the given parameters. This function creates a BeautifulSoup (BS) object according to the given parameters.
In case the given load_xml is True and the XML file exists, the function creates the BS object from the given In case the given load_xml is True and the XML file exists, the function creates the BS object from the given
xml_path, otherwise it uses Shufersal's APIs to download the xml with the relevant content and saves it for xml_path, otherwise it uses Shufersal's APIs to download the xml with the relevant content and saves it for
future use. future use.
:param chain: A given supermarket chain
:param xml_path: A given path to an xml file to load/save the BS object from/to. :param xml_path: A given path to an xml file to load/save the BS object from/to.
:param cat_id: A given id of a category from ShufersalCategories :param category: A given category
:param store_id: A given id of a store :param store_id: A given id of a store
:param load_xml: A flag representing whether to try loading an existing XML file :param load_xml: A flag representing whether to try loading an existing XML file
:return: A BeautifulSoup object with xml content. :return: A BeautifulSoup object with xml content.
""" """
if load_xml and path.isfile(xml_path): if load_xml and path.isfile(xml_path):
return create_bs_object_from_xml(xml_path) return create_bs_object_from_xml(xml_path)
return create_bs_object_from_link(xml_path, store_id, cat_id) return create_bs_object_from_link(xml_path, chain, category, store_id)
def create_bs_object_from_link(xml_path: str, store_id: int, cat_id: int) -> BeautifulSoup: def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category: SupermarketChain.XMLFilesCategory,
store_id: int) -> BeautifulSoup:
""" """
This function creates a BeautifulSoup (BS) object by generating a download link from Shufersal's API. This function creates a BeautifulSoup (BS) object by generating a download link from Shufersal's API.
:param xml_path: A given path to an xml file to load/save the BS object from/to. :param chain: A given supermarket chain
:param xml_path: A given path to an XML file to load/save the BS object from/to.
:param store_id: A given id of a store :param store_id: A given id of a store
:param cat_id: A given id of a category from ShufersalCategories :param category: A given category
:return: A BeautifulSoup object with xml content. :return: A BeautifulSoup object with xml content.
""" """
download_url = get_download_url(store_id, cat_id) download_url: str = chain.get_download_url(store_id, category)
xml_content = gzip.decompress(requests.get(download_url).content) xml_content: AnyStr = gzip.decompress(requests.get(download_url).content)
with open(xml_path, 'wb') as f_out: with open(xml_path, 'wb') as f_out:
f_out.write(xml_content) f_out.write(xml_content)
return BeautifulSoup(xml_content, features='xml') return BeautifulSoup(xml_content, features='xml')
@@ -80,7 +65,7 @@ def create_bs_object_from_link(xml_path: str, store_id: int, cat_id: int) -> Bea
def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup: def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
""" """
This function creates a BeautifulSoup (BS) object from a given xml file. This function creates a BeautifulSoup (BS) object from a given XML file.
:param xml_path: A given path to an xml file to load/save the BS object from/to. :param xml_path: A given path to an xml file to load/save the BS object from/to.
:return: A BeautifulSoup object with xml content. :return: A BeautifulSoup object with xml content.
@@ -89,39 +74,41 @@ def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
return BeautifulSoup(f_in, features='xml') return BeautifulSoup(f_in, features='xml')
def create_items_dict(store_id: int, load_xml) -> Dict: def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, str]:
""" """
This function creates a dictionary where every key is an item code and its value is the item's name and price. This function creates a dictionary where every key is an item code and its value is the item's name and price.
:param chain: A given supermarket chain
:param load_xml: A boolean representing whether to load an existing prices xml file :param load_xml: A boolean representing whether to load an existing prices xml file
:param store_id: A given store id :param store_id: A given store id
:return: A dictionary where the firs :return: A dictionary where the firs
""" """
xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
bs_prices = create_bs_object(xml_path, ShufersalCategories.PricesFull.value, store_id, load_xml) bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all('Item')} return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
def get_item_info(item): def get_item_info(item: Tag) -> str:
return str((item.find('ItemName').text, item.find('ManufacturerName').text, item.find('ItemPrice').text))
def get_products_prices(store_id: int, product_name: str, load_xml: bool):
""" """
This function prints the products in a given Shufersal store which contains a given product_name. This function returns a string containing important information about a given supermarket's product.
"""
return [item.find('ItemName').text, item.find(re.compile(r'Manufacture[r]?Name')).text,
item.find('ItemPrice').text, item.find('ItemCode').text]
:param store_id: A given Shufersal store id
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
"""
This function prints the products in a given store which contains a given product_name.
:param chain: A given supermarket chain
:param store_id: A given store id
:param product_name: A given product name :param product_name: A given product name
:param load_xml: A boolean representing whether to load an existing xml or load an already saved one :param load_xml: A boolean representing whether to load an existing xml or load an already saved one
""" """
xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
bs_prices = create_bs_object(xml_path, ShufersalCategories.PricesFull.value, store_id, load_xml) bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text] prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text)) prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
for prod in prods: for prod in prods:
print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1], print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1],
prod.find('ItemPrice').text)) prod.find('ItemPrice').text))
def is_valid_store_id(store_id: int):
return isinstance(store_id, int) and store_id >= 0