has added Co-Op chain. Extendability to other chain has significantly improved as well.

This commit is contained in:
KorenLazar
2020-12-17 22:48:10 +02:00
parent 4c0eba1771
commit be47a5ad81
10 changed files with 320 additions and 139 deletions

View File

@@ -1,78 +1,63 @@
import gzip
from enum import Enum
from typing import Dict
from typing import AnyStr, Dict
import requests
from bs4 import BeautifulSoup
from pathlib import Path
from bs4.element import Tag
from os import path
from supermarket_chain import SupermarketChain
import re
LOGS_DIRNAME = "logs"
XMLS_DIRNAME = "xmls"
Path(XMLS_DIRNAME).mkdir(exist_ok=True)
class ShufersalCategories(Enum):
All, Prices, PricesFull, Promos, PromosFull, Stores = range(6)
def xml_file_gen(category_name: str, store_id: int) -> str:
def xml_file_gen(chain: SupermarketChain, store_id: int, category_name: str) -> str:
"""
This function generate an xml filename given a store id and a category_name
This function generate an XML filename given a store id and a category_name
If the given store_id is invalid, it is ignored in the returned string.
:param chain: A given supermarket chain
:param store_id: A given store_id
:param category_name: A given category name
:return: An xml filename
"""
store_id_str = f"-{str(store_id)}" if is_valid_store_id(store_id) else ""
return path.join(XMLS_DIRNAME, f"{category_name}{store_id_str}.xml")
store_id_str: str = f"-{str(store_id)}" if SupermarketChain.is_valid_store_id(store_id) else ""
return path.join(XMLS_DIRNAME, f"{chain}-{category_name}{store_id_str}.xml")
def get_download_url(store_id: int, cat_id: int) -> str:
"""
This function scrapes Shufersal's website and returns a url that contains the data for a given store and category.
For info about the categories, see ShufersalCategories.
:param store_id: A given id of a store
:param cat_id: A given id of a category
:return: A downloadable link of the data for a given store and category
"""
url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={cat_id}"
if is_valid_store_id(store_id):
url += f"&storeId={store_id}"
req_res = requests.get(url)
soup = BeautifulSoup(req_res.text, features='lxml')
return soup.find('a', text="לחץ להורדה")['href']
def create_bs_object(xml_path: str, cat_id: int, store_id: int, load_xml: bool) -> BeautifulSoup:
def create_bs_object(xml_path: str, chain: SupermarketChain, store_id: int, load_xml: bool,
category: SupermarketChain.XMLFilesCategory) -> BeautifulSoup:
"""
This function creates a BeautifulSoup (BS) object according to the given parameters.
In case the given load_xml is True and the XML file exists, the function creates the BS object from the given
xml_path, otherwise it uses Shufersal's APIs to download the xml with the relevant content and saves it for
future use.
:param chain: A given supermarket chain
:param xml_path: A given path to an xml file to load/save the BS object from/to.
:param cat_id: A given id of a category from ShufersalCategories
:param category: A given category
:param store_id: A given id of a store
:param load_xml: A flag representing whether to try loading an existing XML file
:return: A BeautifulSoup object with xml content.
"""
if load_xml and path.isfile(xml_path):
return create_bs_object_from_xml(xml_path)
return create_bs_object_from_link(xml_path, store_id, cat_id)
return create_bs_object_from_link(xml_path, chain, category, store_id)
def create_bs_object_from_link(xml_path: str, store_id: int, cat_id: int) -> BeautifulSoup:
def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category: SupermarketChain.XMLFilesCategory,
store_id: int) -> BeautifulSoup:
"""
This function creates a BeautifulSoup (BS) object by generating a download link from Shufersal's API.
:param xml_path: A given path to an xml file to load/save the BS object from/to.
:param chain: A given supermarket chain
:param xml_path: A given path to an XML file to load/save the BS object from/to.
:param store_id: A given id of a store
:param cat_id: A given id of a category from ShufersalCategories
:param category: A given category
:return: A BeautifulSoup object with xml content.
"""
download_url = get_download_url(store_id, cat_id)
xml_content = gzip.decompress(requests.get(download_url).content)
download_url: str = chain.get_download_url(store_id, category)
xml_content: AnyStr = gzip.decompress(requests.get(download_url).content)
with open(xml_path, 'wb') as f_out:
f_out.write(xml_content)
return BeautifulSoup(xml_content, features='xml')
@@ -80,7 +65,7 @@ def create_bs_object_from_link(xml_path: str, store_id: int, cat_id: int) -> Bea
def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
"""
This function creates a BeautifulSoup (BS) object from a given xml file.
This function creates a BeautifulSoup (BS) object from a given XML file.
:param xml_path: A given path to an xml file to load/save the BS object from/to.
:return: A BeautifulSoup object with xml content.
@@ -89,39 +74,41 @@ def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
return BeautifulSoup(f_in, features='xml')
def create_items_dict(store_id: int, load_xml) -> Dict:
def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, str]:
"""
This function creates a dictionary where every key is an item code and its value is the item's name and price.
:param chain: A given supermarket chain
:param load_xml: A boolean representing whether to load an existing prices xml file
:param store_id: A given store id
:return: A dictionary where the firs
"""
xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id)
bs_prices = create_bs_object(xml_path, ShufersalCategories.PricesFull.value, store_id, load_xml)
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all('Item')}
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
def get_item_info(item):
return str((item.find('ItemName').text, item.find('ManufacturerName').text, item.find('ItemPrice').text))
def get_products_prices(store_id: int, product_name: str, load_xml: bool):
def get_item_info(item: Tag) -> str:
"""
This function prints the products in a given Shufersal store which contains a given product_name.
This function returns a string containing important information about a given supermarket's product.
"""
return [item.find('ItemName').text, item.find(re.compile(r'Manufacture[r]?Name')).text,
item.find('ItemPrice').text, item.find('ItemCode').text]
:param store_id: A given Shufersal store id
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
"""
This function prints the products in a given store which contains a given product_name.
:param chain: A given supermarket chain
:param store_id: A given store id
:param product_name: A given product name
:param load_xml: A boolean representing whether to load an existing xml or load an already saved one
"""
xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id)
bs_prices = create_bs_object(xml_path, ShufersalCategories.PricesFull.value, store_id, load_xml)
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
for prod in prods:
print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1],
prod.find('ItemPrice').text))
def is_valid_store_id(store_id: int):
return isinstance(store_id, int) and store_id >= 0