Added the Co-Op chain. Extensibility to other chains has significantly improved as well.
.gitignore (vendored), 1 line changed:
@@ -8,3 +8,4 @@ stores_*
 venv/
 __pycache__/
 xmls/
+logs/
README.md, 18 lines changed:
@@ -1,10 +1,10 @@
-# Shufersal basic scraping
+# Supermarket basic scraping
 
 ## Installation
 clone:
 ```cmd script
-git clone https://github.com/korenLazar/shufersal-scraping.git
-cd shufersal-scraping
+git clone https://github.com/korenLazar/supermarket-scraping.git
+cd supermarket-scraping
 virtualenv venv
 venv\bin\activate
 pip install -r requirements.txt
@@ -16,17 +16,17 @@ pip install -r requirements.txt
 2. virtualenv
 
 ## Usage
-First, to find your store's id, you can run the following command (assuming you live in Jerusalem):
+First, to find your Shufersal store's ID, you can run the following command (assuming you live in Jerusalem):
 ```cmd script
-python main.py --find_store ירושלים
+python main.py --find_store ירושלים --chain Shufersal
 ```
-After running the command, you'll be able to see the different stores in Jerusalem with their ids in "stores_ירושלים.log".
+After running the command, you'll be able to see the different stores in Jerusalem with their IDs on the screen.
 
-Now, that we have the store's id, we can get its promotions sorted by their update date by running
+Now, that we have the store's ID, we can get its promotions sorted by their update date by running
 ```cmd script
-python main.py --promos 5
+python main.py --promos 5 --chain Shufersal
 ```
-* We assumed that the store's id is 5.
+* We assumed that the store's ID is 5.
 Now, you can find the promos in "promos_5.log".
 
 For other documentation and commands, you can run
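Since the --chain argument introduced in main.py below is required for every command, the same workflow applies to Co-Op; a minimal example, assuming a Co-Op store with ID 5:

```cmd script
python main.py --promos 5 --chain Co-Op
```

With the new logging setup in main.py, the output should land in logs/Co-Op_promos_5.log rather than promos_5.log.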
co_op.py (new file), 36 lines:
@@ -0,0 +1,36 @@
+from typing import Dict, List
+import requests
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+from supermarket_chain import SupermarketChain
+
+
+class CoOp(SupermarketChain):
+    promotion_tag_name = 'Sale'
+    promotion_update_tag_name = 'PriceUpdateDate'
+    date_format = '%Y/%m/%d'
+    date_hour_format = '%Y/%m/%d %H:%M:%S'
+    item_tag_name = 'Product'
+
+    @staticmethod
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+        prefix = "http://matrixcatalog.co.il/"
+        url = prefix + "NBCompetitionRegulations.aspx"
+        req_res: requests.Response = requests.get(url)
+        soup = BeautifulSoup(req_res.text, features='lxml')
+        suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
+                                and f'-{store_id}-20' in value).attrs['href']
+        down_url = prefix + suffix
+        print(down_url)
+        return down_url
+
+    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
+        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)
+
+    def __repr__(self):
+        return 'Co-Op'
+
+    @staticmethod
+    def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
+        promo_item = items_dict.get(promo.find('ItemCode').text)
+        return [promo_item] if promo_item else []
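A minimal sketch (not part of the commit) of how the CoOp class attributes are meant to be consumed by the generic parsing code; the XML fragment, item code, and items_dict value below are invented for illustration:

```python
from datetime import datetime

from bs4 import BeautifulSoup

from co_op import CoOp

chain = CoOp()

# Invented Co-Op-style fragment: promotions are 'Sale' tags and the update
# timestamp lives in 'PriceUpdateDate', as declared on the class above.
xml = """
<Root>
  <Sale>
    <ItemCode>123</ItemCode>
    <PriceUpdateDate>2021/01/31 08:00:00</PriceUpdateDate>
  </Sale>
</Root>
"""
soup = BeautifulSoup(xml, features='xml')
items_dict = {'123': 'some product, some manufacturer, 9.90'}  # assumed shape of create_items_dict output

for promo in soup.find_all(chain.promotion_tag_name):  # finds the 'Sale' tags
    update = datetime.strptime(promo.find(chain.promotion_update_tag_name).text,
                               chain.date_hour_format)  # '%Y/%m/%d %H:%M:%S'
    print(update, chain.get_items(promo, items_dict))   # -> [the matching item string]
```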
main.py, 53 lines changed:
@@ -1,11 +1,22 @@
 from argparse import ArgumentParser
 import logging
 from promotion import main_latest_promos, get_promos_by_name
-from store import get_store_id, store_id_type
-from utils import get_products_prices
+from store_utils import get_store_id
+from utils import LOGS_DIRNAME, XMLS_DIRNAME, get_products_prices
+from supermarket_chain import SupermarketChain
+from shufersal import ShuferSal
+from co_op import CoOp
+from pathlib import Path
 
-# TODO: improve extendability: support addition of different supermarket chains
-# TODO: fix problem of left-to-right printing in Windows' cmd
+# TODO: fix problem of left-to-right printing
+
+Path(LOGS_DIRNAME).mkdir(exist_ok=True)
+Path(XMLS_DIRNAME).mkdir(exist_ok=True)
+
+chain_dict = {
+    'Shufersal': ShuferSal(),
+    'Co-Op': CoOp(),
+}
 
 if __name__ == '__main__':
     parser = ArgumentParser()
@@ -13,7 +24,7 @@ if __name__ == '__main__':
                         help="generates a promos_{store_id}.log file with all the promotions in the requested store",
                         metavar='store_id',
                         nargs=1,
-                        type=store_id_type,
+                        type=SupermarketChain.store_id_type,
                         )
     parser.add_argument('--find_promos_by_name',
                         help="prints all promos containing the given promo_name in the given store",
@@ -31,30 +42,44 @@
                         metavar='city',
                         nargs=1,
                         )
-    parser.add_argument('--load_xml',
-                        help='boolean flag representing whether to load an existing xml',
+    parser.add_argument('--load_prices',
+                        help='boolean flag representing whether to load an existing price XML file',
                         action='store_true',
                         )
+    parser.add_argument('--load_promos',
+                        help='boolean flag representing whether to load an existing promo XML file',
+                        action='store_true',
+                        )
+    parser.add_argument('--load_stores',
+                        help='boolean flag representing whether to load an existing stores XML file',
+                        action='store_true',
+                        )
+    parser.add_argument('--chain',
+                        required=True,
+                        help='The name of the requested chain',
+                        choices=['Shufersal', 'Co-Op'],
+                        )
     args = parser.parse_args()
 
+    chain: SupermarketChain = chain_dict[args.chain]
     if args.promos:
         arg_store_id = int(args.promos[0])
 
         logger = logging.getLogger()
         logger.setLevel(logging.INFO)
-        handler = logging.FileHandler(filename=f'promos_{arg_store_id}.log', mode='w', encoding='utf-8')
+        handler = logging.FileHandler(filename=f'logs/{args.chain}_promos_{arg_store_id}.log', mode='w',
+                                      encoding='utf-8')
         logger.addHandler(handler)
-        main_latest_promos(store_id=arg_store_id,
-                           load_xml=args.load_xml,
-                           logger=logger)
+        main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain)
 
     elif args.price:
-        get_products_prices(store_id=args.price[0], product_name=args.price[1], load_xml=args.load_xml)
+        get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1])
 
     elif args.find_store_id:
         arg_city = args.find_store_id[0]
-        get_store_id(city=arg_city, load_xml=args.load_xml)
+        get_store_id(city=arg_city, load_xml=args.load_stores, chain=chain)
 
     elif args.find_promos_by_name:
         arg_store_id = int(args.find_promos_by_name[0])
-        get_promos_by_name(store_id=arg_store_id, load_xml=args.load_xml, promo_name=args.find_promos_by_name[1])
+        get_promos_by_name(store_id=arg_store_id, chain=chain, promo_name=args.find_promos_by_name[1],
+                           load_prices=args.load_prices, load_promos=args.load_promos)
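A short usage sketch of the new flags (store ID 5 and a previously downloaded XML are assumed):

```cmd script
python main.py --chain Shufersal --promos 5
python main.py --chain Shufersal --promos 5 --load_prices
```

The second run should reuse the prices XML saved under xmls/ by the first one instead of downloading it again; --load_promos and --load_stores are the analogous switches for the promotions and stores files.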
promotion.py, 89 lines changed:
@@ -1,11 +1,13 @@
 from datetime import datetime
-from typing import List
+from typing import Dict, List
+
+from bs4 import BeautifulSoup
 
 from utils import (
-    ShufersalCategories,
     create_items_dict,
     xml_file_gen,
     create_bs_object,
 )
+from supermarket_chain import SupermarketChain
 
 PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל']
@@ -23,66 +25,93 @@ class Promotion:
         self.update_date: datetime = update_date
         self.items: List[str] = items
 
-    def __str__(self):
+    def __repr__(self):
         title = self.content
-        dates_range = f"Between {self.start_date.date()} and {self.end_date.date()}"
-        update_line = f"Updated at {self.update_date.date()}"
+        dates_range = f"Between {self.start_date} and {self.end_date}"
+        update_line = f"Updated at {self.update_date}"
         items = '\n'.join(str(item) for item in self.items)
         return '\n'.join([title, dates_range, update_line, items]) + '\n'
 
+    def repr_ltr(self):
+        title = self.content[::-1]
+        dates_range = f"Between {self.start_date} and {self.end_date}"
+        update_line = f"Updated at {self.update_date}"
+        items = '\n'.join(str(item)[::-1] for item in self.items)
+        return '\n'.join([title, dates_range, update_line, items]) + '\n'
+
 
-def get_available_promos(store_id: int, load_xml: bool) -> List[Promotion]:
+def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]:
     """
     This function return the available promotions given a BeautifulSoup object.
 
+    :param load_promos:
+    :param chain: The name of the requested supermarket chain
     :param store_id: A given store id
-    :param load_xml: A boolean representing whether to load an existing xml or load an already saved one
+    :param load_prices: A boolean representing whether to load an existing xml or load an already saved one
     :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available
     """
-    items_dict = create_items_dict(store_id, load_xml)
-    xml_path = xml_file_gen(ShufersalCategories.PromosFull.name, store_id)
-    bs_promos = create_bs_object(xml_path, ShufersalCategories.PromosFull.value, store_id, False)
+    items_dict: Dict[str, str] = create_items_dict(chain, load_prices, store_id)
+    xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
+    bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, chain.XMLFilesCategory.PromosFull)
+
     promo_objs = list()
-    for cur_promo in bs_promos.find_all("Promotion"):
-        cur_promo = Promotion(
-            content=cur_promo.find('PromotionDescription').text,
-            start_date=datetime.strptime(cur_promo.find('PromotionStartDate').text, '%Y-%m-%d'),
-            end_date=datetime.strptime(cur_promo.find('PromotionEndDate').text, '%Y-%m-%d'),
-            update_date=datetime.strptime(cur_promo.find('PromotionUpdateDate').text, '%Y-%m-%d %H:%M'),
-            items=[items_dict.get(item.find('ItemCode').text) for item in cur_promo.find_all('Item')
-                   if items_dict.get(item.find('ItemCode').text)],
+    for promo in bs_promos.find_all(chain.promotion_tag_name):
+        promo = Promotion(
+            content=promo.find('PromotionDescription').text,
+            start_date=datetime.strptime(
+                promo.find('PromotionStartDate').text + ' ' + promo.find('PromotionStartHour').text,
+                chain.date_hour_format),
+            end_date=datetime.strptime(promo.find(
+                'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format),
+            update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format),
+            items=chain.get_items(promo, items_dict),
         )
-        if is_valid_promo(cur_promo):
-            promo_objs.append(cur_promo)
+        if is_valid_promo(promo):
+            promo_objs.append(promo)
     return promo_objs
 
 
 def is_valid_promo(promo: Promotion):
-    today_date = datetime.now()
-    not_expired = promo.end_date.date() >= today_date.date()
-    has_started = promo.start_date <= today_date
-    has_products = len(promo.items) > 0
-    in_promo_ignore_list = any(product in promo.content for product in PRODUCTS_TO_IGNORE)
+    """
+    This function checks if a given promo object is valid.
+
+    :param promo: A given promotion
+    :return: True iff the given Promotion is valid.
+    """
+    today_date: datetime = datetime.now()
+    not_expired: bool = promo.end_date >= today_date
+    has_started: bool = promo.start_date <= today_date
+    has_products: bool = len(promo.items) > 0
+    in_promo_ignore_list: bool = any(product in promo.content for product in PRODUCTS_TO_IGNORE)
     return not_expired and has_started and has_products and not in_promo_ignore_list
 
 
-def main_latest_promos(store_id: int, load_xml: bool, logger):
+def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
     """
     This function logs the available promos in a store with a given id sorted by their update date.
 
+    :param chain: The name of the requested supermarket chain
     :param store_id: A given store id
     :param load_xml: A boolean representing whether to load an existing prices xml file
     :param logger: A given logger
     """
-    promotions = get_available_promos(store_id, load_xml)
+    promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False)
     promotions.sort(key=lambda promo: max(promo.update_date, promo.start_date), reverse=True)
     logger.info('\n'.join(str(promotion) for promotion in promotions))
 
 
-def get_promos_by_name(store_id: int, load_xml: bool, promo_name: str):
-    promotions = get_available_promos(store_id, load_xml)
+def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
+    """
+    This function prints all promotions in a given chain and store_id containing a given promo_name.
+
+    :param store_id: A given store ID
+    :param chain: A given supermarket chain
+    :param promo_name: A given name of a promo (or part of it)
+    :param load_prices: A boolean representing whether to load an saved prices XML file or scrape a new one
+    :param load_promos: A boolean representing whether to load an saved XML file or scrape a new one
+    """
+    promotions: List[Promotion] = get_available_promos(chain, store_id, load_prices, load_promos)
     for promo in promotions:
         if promo_name in promo.content:
-            print(str(promo))
+            print(promo.repr_ltr())
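To illustrate the reworked date handling with made-up timestamps: start and end times are now assembled from separate date and hour tags and parsed with the chain-specific format string from the chain classes in co_op.py and shufersal.py, so both chains go through the same code path:

```python
from datetime import datetime

from co_op import CoOp
from shufersal import ShuferSal

# The format strings come from the chain classes; the timestamp values here are invented.
print(datetime.strptime('2021-01-31' + ' ' + '23:30', ShuferSal.date_hour_format))   # '%Y-%m-%d %H:%M'
print(datetime.strptime('2021/01/31' + ' ' + '23:30:00', CoOp.date_hour_format))     # '%Y/%m/%d %H:%M:%S'
```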
shufersal.py (new file), 33 lines:
@@ -0,0 +1,33 @@
+from typing import Dict, List
+import requests
+from bs4 import BeautifulSoup
+from bs4.element import Tag
+from supermarket_chain import SupermarketChain
+
+
+class ShuferSal(SupermarketChain):
+    promotion_tag_name = 'Promotion'
+    promotion_update_tag_name = 'PromotionUpdateDate'
+    date_format = '%Y-%m-%d'
+    date_hour_format = '%Y-%m-%d %H:%M'
+    item_tag_name = 'Item'
+
+    @staticmethod
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+        url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}"
+        if SupermarketChain.is_valid_store_id(store_id):
+            url += f"&storeId={store_id}"
+        req_res: requests.Response = requests.get(url)
+        soup = BeautifulSoup(req_res.text, features='lxml')
+        return soup.find('a', text="לחץ להורדה")['href']
+
+    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
+        All, Prices, PricesFull, Promos, PromosFull, Stores = range(6)
+
+    def __repr__(self):
+        return 'Shufersal'
+
+    @staticmethod
+    def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
+        return [items_dict.get(item.find('ItemCode').text) for item in promo.find_all('Item')
+                if items_dict.get(item.find('ItemCode').text)]
store.py (deleted), 30 lines:
@@ -1,30 +0,0 @@
-from argparse import ArgumentTypeError
-
-from utils import (
-    ShufersalCategories,
-    is_valid_store_id,
-    xml_file_gen,
-    create_bs_object,
-)
-
-
-def store_id_type(store_id: str):
-    if not is_valid_store_id(int(store_id)):
-        raise ArgumentTypeError(f"Given store_id: {store_id} is not a valid store_id.")
-    return store_id
-
-
-def get_store_id(city: str, load_xml: bool):
-    """
-    This function prints the store_ids of Shufersal stores in a given city.
-    The city must match exactly to its spelling in Shufersal's website (hence it should be in Hebrew alphabet).
-
-    :param load_xml: A boolean representing whether to load an existing xml or load an already saved one
-    :param city: A string representing the city of the requested store.
-    """
-    xml_path = xml_file_gen(ShufersalCategories.Stores.name, -1)
-    bs_stores = create_bs_object(xml_path, ShufersalCategories.Stores.value, -1, load_xml)
-
-    for store in bs_stores.find_all("STORE"):
-        if store.find("CITY").text == city:
-            print((store.find("ADDRESS").text[::-1], store.find("STOREID").text, store.find("SUBCHAINNAME").text[::-1]))
store_utils.py (new file), 20 lines:
@@ -0,0 +1,20 @@
+from utils import xml_file_gen, create_bs_object
+from supermarket_chain import SupermarketChain
+from bs4 import BeautifulSoup
+
+
+def get_store_id(city: str, load_xml: bool, chain: SupermarketChain):
+    """
+    This function prints the store_ids of stores in a given city.
+    The city must match exactly to its spelling in Shufersal's website (hence it should be in Hebrew alphabet).
+
+    :param chain: A given supermarket chain
+    :param load_xml: A boolean representing whether to load an existing xml or load an already saved one
+    :param city: A string representing the city of the requested store.
+    """
+    xml_path: str = xml_file_gen(chain, -1, chain.XMLFilesCategory.Stores.name)
+    bs_stores: BeautifulSoup = create_bs_object(xml_path, chain, -1, load_xml, chain.XMLFilesCategory.Stores)
+
+    for store in bs_stores.find_all("STORE"):
+        if store.find("CITY").text == city:
+            print((store.find("ADDRESS").text, store.find("STOREID").text, store.find("SUBCHAINNAME").text))
supermarket_chain.py (new file), 80 lines:
@@ -0,0 +1,80 @@
+from abc import abstractmethod
+from enum import Enum
+from argparse import ArgumentTypeError
+from typing import Dict, List
+from bs4.element import Tag
+
+
+class SupermarketChain:
+    """
+    A class representing a supermarket chain.
+    """
+    class XMLFilesCategory(Enum):
+        """
+        An enum class of different XML files produced by a supermarket chain
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def promotion_tag_name(self): pass
+
+    @property
+    @abstractmethod
+    def promotion_update_tag_name(self): pass
+
+    @property
+    @abstractmethod
+    def date_format(self): pass
+
+    @property
+    @abstractmethod
+    def date_hour_format(self): pass
+
+    @property
+    @abstractmethod
+    def item_tag_name(self): pass
+
+    @staticmethod
+    def is_valid_store_id(store_id: int) -> bool:
+        """
+        This method returns whether a given store ID is valid (French Natural number).
+
+        :param store_id: A given store ID
+        """
+        return isinstance(store_id, int) and store_id >= 0
+
+    @staticmethod
+    def store_id_type(store_id: str) -> str:
+        """
+        This method used as a type verification for store_id.
+
+        :param store_id: A given store ID
+        :return: The given store_id if valid, else raise an ArgumentTypeError.
+        """
+        if not SupermarketChain.is_valid_store_id(int(store_id)):
+            raise ArgumentTypeError(f"Given store_id: {store_id} is not a valid store_id.")
+        return store_id
+
+    @staticmethod
+    @abstractmethod
+    def get_download_url(store_id: int, category: XMLFilesCategory) -> str:
+        """
+        This method scrapes supermarket's website and returns a url containing the data for a given store and category.
+
+        :param store_id: A given id of a store
+        :param category: A given category
+        :return: A downloadable link of the data for a given store and category
+        """
+        pass
+
+    @staticmethod
+    @abstractmethod
+    def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
+        """
+        This method returns a list of the items that participate in a given promo
+
+        :param promo: A given promo
+        :param items_dict: A given dictionary of products
+        """
+        pass
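As a hypothetical sketch of the extensibility the commit message claims, a further chain would subclass SupermarketChain, fill in the tag names and formats, and implement the two abstract static methods; "MyChain", its tag names, and its URL are invented for illustration:

```python
from typing import Dict, List

from bs4.element import Tag

from supermarket_chain import SupermarketChain


class MyChain(SupermarketChain):
    promotion_tag_name = 'Promotion'
    promotion_update_tag_name = 'PromotionUpdateDate'
    date_format = '%Y-%m-%d'
    date_hour_format = '%Y-%m-%d %H:%M'
    item_tag_name = 'Item'

    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
        All, Prices, PricesFull, Promos, PromosFull, Stores = range(6)

    def __repr__(self):
        return 'MyChain'

    @staticmethod
    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
        # Each chain scrapes its own published-prices page; this URL is a placeholder.
        return f"http://example.com/prices?catID={category.value}&storeId={store_id}"

    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
        return [items_dict[code.text] for code in promo.find_all('ItemCode') if code.text in items_dict]
```

It would then be registered in main.py's chain_dict and added to the --chain choices.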
utils.py, 97 lines changed:
@@ -1,78 +1,63 @@
 import gzip
-from enum import Enum
-from typing import Dict
+from typing import AnyStr, Dict
 import requests
 from bs4 import BeautifulSoup
-from pathlib import Path
+from bs4.element import Tag
 from os import path
+from supermarket_chain import SupermarketChain
+import re
 
+LOGS_DIRNAME = "logs"
 XMLS_DIRNAME = "xmls"
-Path(XMLS_DIRNAME).mkdir(exist_ok=True)
 
 
-class ShufersalCategories(Enum):
-    All, Prices, PricesFull, Promos, PromosFull, Stores = range(6)
-
-
-def xml_file_gen(category_name: str, store_id: int) -> str:
+def xml_file_gen(chain: SupermarketChain, store_id: int, category_name: str) -> str:
     """
-    This function generate an xml filename given a store id and a category_name
+    This function generate an XML filename given a store id and a category_name
     If the given store_id is invalid, it is ignored in the returned string.
 
+    :param chain: A given supermarket chain
     :param store_id: A given store_id
     :param category_name: A given category name
     :return: An xml filename
     """
-    store_id_str = f"-{str(store_id)}" if is_valid_store_id(store_id) else ""
-    return path.join(XMLS_DIRNAME, f"{category_name}{store_id_str}.xml")
+    store_id_str: str = f"-{str(store_id)}" if SupermarketChain.is_valid_store_id(store_id) else ""
+    return path.join(XMLS_DIRNAME, f"{chain}-{category_name}{store_id_str}.xml")
 
 
-def get_download_url(store_id: int, cat_id: int) -> str:
-    """
-    This function scrapes Shufersal's website and returns a url that contains the data for a given store and category.
-    For info about the categories, see ShufersalCategories.
-
-    :param store_id: A given id of a store
-    :param cat_id: A given id of a category
-    :return: A downloadable link of the data for a given store and category
-    """
-    url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={cat_id}"
-    if is_valid_store_id(store_id):
-        url += f"&storeId={store_id}"
-    req_res = requests.get(url)
-    soup = BeautifulSoup(req_res.text, features='lxml')
-    return soup.find('a', text="לחץ להורדה")['href']
-
-
-def create_bs_object(xml_path: str, cat_id: int, store_id: int, load_xml: bool) -> BeautifulSoup:
+def create_bs_object(xml_path: str, chain: SupermarketChain, store_id: int, load_xml: bool,
+                     category: SupermarketChain.XMLFilesCategory) -> BeautifulSoup:
     """
     This function creates a BeautifulSoup (BS) object according to the given parameters.
     In case the given load_xml is True and the XML file exists, the function creates the BS object from the given
     xml_path, otherwise it uses Shufersal's APIs to download the xml with the relevant content and saves it for
     future use.
 
+    :param chain: A given supermarket chain
     :param xml_path: A given path to an xml file to load/save the BS object from/to.
-    :param cat_id: A given id of a category from ShufersalCategories
+    :param category: A given category
     :param store_id: A given id of a store
     :param load_xml: A flag representing whether to try loading an existing XML file
     :return: A BeautifulSoup object with xml content.
     """
     if load_xml and path.isfile(xml_path):
         return create_bs_object_from_xml(xml_path)
-    return create_bs_object_from_link(xml_path, store_id, cat_id)
+    return create_bs_object_from_link(xml_path, chain, category, store_id)
 
 
-def create_bs_object_from_link(xml_path: str, store_id: int, cat_id: int) -> BeautifulSoup:
+def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category: SupermarketChain.XMLFilesCategory,
+                               store_id: int) -> BeautifulSoup:
     """
     This function creates a BeautifulSoup (BS) object by generating a download link from Shufersal's API.
 
-    :param xml_path: A given path to an xml file to load/save the BS object from/to.
+    :param chain: A given supermarket chain
+    :param xml_path: A given path to an XML file to load/save the BS object from/to.
     :param store_id: A given id of a store
-    :param cat_id: A given id of a category from ShufersalCategories
+    :param category: A given category
     :return: A BeautifulSoup object with xml content.
     """
-    download_url = get_download_url(store_id, cat_id)
-    xml_content = gzip.decompress(requests.get(download_url).content)
+    download_url: str = chain.get_download_url(store_id, category)
+    xml_content: AnyStr = gzip.decompress(requests.get(download_url).content)
     with open(xml_path, 'wb') as f_out:
         f_out.write(xml_content)
     return BeautifulSoup(xml_content, features='xml')
@@ -80,7 +65,7 @@ def create_bs_object_from_link(xml_path: str, store_id: int, cat_id: int) -> BeautifulSoup:
 
 def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
     """
-    This function creates a BeautifulSoup (BS) object from a given xml file.
+    This function creates a BeautifulSoup (BS) object from a given XML file.
 
     :param xml_path: A given path to an xml file to load/save the BS object from/to.
     :return: A BeautifulSoup object with xml content.
@@ -89,39 +74,41 @@ def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
         return BeautifulSoup(f_in, features='xml')
 
 
-def create_items_dict(store_id: int, load_xml) -> Dict:
+def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, str]:
     """
     This function creates a dictionary where every key is an item code and its value is the item's name and price.
 
+    :param chain: A given supermarket chain
     :param load_xml: A boolean representing whether to load an existing prices xml file
     :param store_id: A given store id
    :return: A dictionary where the firs
     """
-    xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id)
-    bs_prices = create_bs_object(xml_path, ShufersalCategories.PricesFull.value, store_id, load_xml)
-    return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all('Item')}
+    xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
+    bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
+    return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
 
 
-def get_item_info(item):
-    return str((item.find('ItemName').text, item.find('ManufacturerName').text, item.find('ItemPrice').text))
-
-
-def get_products_prices(store_id: int, product_name: str, load_xml: bool):
+def get_item_info(item: Tag) -> str:
     """
-    This function prints the products in a given Shufersal store which contains a given product_name.
+    This function returns a string containing important information about a given supermarket's product.
+    """
+    return [item.find('ItemName').text, item.find(re.compile(r'Manufacture[r]?Name')).text,
+            item.find('ItemPrice').text, item.find('ItemCode').text]
+
+
+def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
+    """
+    This function prints the products in a given store which contains a given product_name.
 
-    :param store_id: A given Shufersal store id
+    :param chain: A given supermarket chain
+    :param store_id: A given store id
     :param product_name: A given product name
     :param load_xml: A boolean representing whether to load an existing xml or load an already saved one
     """
-    xml_path = xml_file_gen(ShufersalCategories.PricesFull.name, store_id)
-    bs_prices = create_bs_object(xml_path, ShufersalCategories.PricesFull.value, store_id, load_xml)
+    xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
+    bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
     prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
     prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
     for prod in prods:
         print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1],
                prod.find('ItemPrice').text))
-
-
-def is_valid_store_id(store_id: int):
-    return isinstance(store_id, int) and store_id >= 0
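A small sketch of the new chain-aware file naming (store ID 5 assumed; the path separator depends on the operating system):

```python
from shufersal import ShuferSal
from utils import xml_file_gen

chain = ShuferSal()
print(xml_file_gen(chain, 5, chain.XMLFilesCategory.PromosFull.name))
# expected on Linux/macOS: xmls/Shufersal-PromosFull-5.xml
# the 'Shufersal' prefix comes from ShuferSal.__repr__ via the f-string inside xml_file_gen
```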