Files
supermarket-scraping/promotion.py
2021-02-06 21:42:31 +02:00

175 lines
7.3 KiB
Python

from datetime import datetime
from typing import Dict, List
import csv
from item import Item
from utils import (
create_items_dict,
xml_file_gen,
create_bs_object,
)
from supermarket_chain import SupermarketChain
# Substrings that disqualify a promotion: is_valid_promo rejects every promotion whose
# description contains at least one of these entries.
# NOTE(review): the entries look like non-grocery merchandise keywords (e.g. pots, towels) - confirm.
PRODUCTS_TO_IGNORE = [
    'סירים',
    'מגבות',
    'מגבת',
    'מפות',
    'פסטיגל',
    'ביגי',
]
class Promotion:
    """
    A promotion of a supermarket chain (the original data model follows Shufersal's files).
    It contains only part of the information available in the chain's data: the textual
    description, the validity period, the last update time and the participating items.

    Two promotions are considered equal when their description, start date and end date are
    equal; the update date and the items are deliberately not part of the comparison.
    """

    def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime,
                 items: List[Item]):
        """
        :param content: The textual description of the promotion
        :param start_date: The moment the promotion starts
        :param end_date: The moment the promotion ends
        :param update_date: The moment the promotion was last updated
        :param items: The items participating in the promotion
        """
        self.content: str = content
        self.start_date: datetime = start_date
        self.end_date: datetime = end_date
        self.update_date: datetime = update_date
        self.items: List[Item] = items

    def __repr__(self):
        """
        Return a multi-line description: the content, the dates range, the update time and then
        one line per item (using str(item)), terminated by a newline.
        """
        title = self.content
        dates_range = f"Between {self.start_date} and {self.end_date}"
        update_line = f"Updated at {self.update_date}"
        items = '\n'.join(str(item) for item in self.items)
        return '\n'.join([title, dates_range, update_line, items]) + '\n'

    def repr_ltr(self):
        """
        Return the same text as __repr__.
        Kept as a separate method because callers use it by name.
        """
        return self.__repr__()

    def __eq__(self, other):
        """
        Compare by content, start date and end date.
        Returns NotImplemented for objects that are not promotions, so that comparisons such as
        `promo == None` evaluate to False instead of raising AttributeError.
        """
        if not isinstance(other, Promotion):
            return NotImplemented
        return self.content == other.content and self.start_date == other.start_date and self.end_date == other.end_date

    # Defining __eq__ without __hash__ already makes instances unhashable; stated explicitly
    # because promotions are mutable (their items list is extended when equal promos are merged).
    __hash__ = None
def write_promotions_to_csv(promotions: List[Promotion], output_filename: str, encoding: str = 'utf-8') -> None:
    """
    This function writes a given list of promotions to a given output file in a CSV format.
    The first row is a Hebrew header; after it, one row is written for every item of every promotion.

    :param promotions: A given list of promotions
    :param output_filename: A given file to write to (overwritten if it already exists)
    :param encoding: The text encoding of the output file. It is set explicitly because the header is
                     in Hebrew and the platform's default encoding may be unable to encode it.
                     Pass 'utf-8-sig' to prepend a BOM if the consuming program needs one.
    """
    header = [
        'תיאור המבצע',
        'הפריט המשתתף במבצע',
        'מחיר לפני המבצע',
        'זמן תחילת המבצע',
        'זמן סיום המבצע',
        'זמן עדכון אחרון',
        'יצרן',
        'ברקוד של הפריט'
    ]
    # newline='' is what the csv module requires so that it controls the line endings itself
    with open(output_filename, mode='w', newline='', encoding=encoding) as f_out:
        promos_writer = csv.writer(f_out)
        promos_writer.writerow(header)
        for promo in promotions:
            # The promotion-level columns are repeated on each of the promotion's item rows
            promos_writer.writerows(
                [promo.content,
                 item.name,
                 item.price,
                 promo.start_date,
                 promo.end_date,
                 promo.update_date,
                 item.manufacturer,
                 item.code]
                for item in promo.items
            )
def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]:
    """
    This function collects the promotions of a given store that are currently valid.
    It builds the store's items dictionary, reads the chain's full promotions XML file and converts
    every promotion element in it to a Promotion object. Only promotions accepted by is_valid_promo
    are kept, and a promotion equal to the previously kept one is merged into it.

    :param chain: The requested supermarket chain
    :param store_id: A given store id
    :param load_prices: A boolean representing whether to load a saved prices XML file or scrape a new one
                        (forwarded to create_items_dict)
    :param load_promos: A boolean representing whether to load a saved promos XML file or scrape a new one
                        (forwarded to create_bs_object)
    :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available
    """

    def parse_date_hour(tag, date_field: str, hour_field: str) -> datetime:
        # The date and the hour are stored in two separate elements of the promotion
        return datetime.strptime(tag.find(date_field).text + ' ' + tag.find(hour_field).text,
                                 chain.date_hour_format)

    items_dict: Dict[str, Item] = create_items_dict(chain, load_prices, store_id)
    promos_category = chain.XMLFilesCategory.PromosFull
    xml_path: str = xml_file_gen(chain, store_id, promos_category.name)
    bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, promos_category)

    available = []
    for promo_tag in bs_promos.find_all(chain.promotion_tag_name):
        description = promo_tag.find('PromotionDescription').text
        starts_at = parse_date_hour(promo_tag, 'PromotionStartDate', 'PromotionStartHour')
        ends_at = parse_date_hour(promo_tag, 'PromotionEndDate', 'PromotionEndHour')
        updated_at = datetime.strptime(promo_tag.find(chain.promotion_update_tag_name).text,
                                       chain.update_date_format)
        candidate = Promotion(
            content=description,
            start_date=starts_at,
            end_date=ends_at,
            update_date=updated_at,
            items=chain.get_items(promo_tag, items_dict),
        )
        if not is_valid_promo(candidate):
            continue

        # Merge equal promos: only the most recently kept promotion is checked
        if available and available[-1] == candidate:
            available[-1].items.extend(candidate.items)
        else:
            available.append(candidate)

    return available
def is_valid_promo(promo: Promotion):
    """
    This function tells whether a given Promotion object is valid at the time of the call:
    it has not ended, it has already started, it has at least one item, and its content contains
    none of the entries of PRODUCTS_TO_IGNORE.
    """
    now: datetime = datetime.now()
    checks = (
        promo.end_date >= now,  # not expired
        promo.start_date <= now,  # has started
        len(promo.items) > 0,  # has products
        not any(ignored in promo.content for ignored in PRODUCTS_TO_IGNORE),  # not in the ignore list
    )
    return all(checks)
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain, load_promos: bool):
    """
    This function logs the available promotions in a store with a given id, latest first, and writes
    them to a CSV file under the 'results' directory.
    Promotions are ordered by the later of their update day and start day (descending); ties are
    ordered by start_date - end_date (descending), so shorter promotions come first.

    :param store_id: A given store id
    :param load_xml: A boolean representing whether to load an existing prices xml file
    :param logger: A given logger
    :param chain: The requested supermarket chain
    :param load_promos: A boolean representing whether to load an existing promos xml file
    """

    def recency_key(promo):
        latest_day = max(promo.update_date.date(), promo.start_date.date())
        return latest_day, promo.start_date - promo.end_date

    promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
    promotions.sort(key=recency_key, reverse=True)
    logger.info('\n'.join(map(str, promotions)))

    output_path = f'results/{repr(type(chain))}_promos_{store_id}.csv'
    write_promotions_to_csv(promotions, output_path)
def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
    """
    This function prints every available promotion of a given chain and store whose content contains
    a given promo_name.

    :param store_id: A given store ID
    :param chain: A given supermarket chain
    :param promo_name: A given name of a promo (or part of it)
    :param load_prices: A boolean representing whether to load a saved prices XML file or scrape a new one
    :param load_promos: A boolean representing whether to load a saved promos XML file or scrape a new one
    """
    available: List[Promotion] = get_available_promos(chain, store_id, load_prices, load_promos)
    matching = (promo for promo in available if promo_name in promo.content)
    for promo in matching:
        print(promo.repr_ltr())
def get_all_null_items_in_promos(chain, store_id):
    """
    This function finds all items appearing in the chain's promotions file but not in the chain's prices file.
    Both the prices and the promotions are requested with their load flag set to True.

    :param chain: A given supermarket chain
    :param store_id: A given store ID
    :return: A list concatenating the results of chain.get_null_items over all the promotions in the file
    """
    items_dict: Dict[str, Item] = create_items_dict(chain, True, store_id)
    promos_category = chain.XMLFilesCategory.PromosFull
    xml_path: str = xml_file_gen(chain, store_id, promos_category.name)
    bs_promos = create_bs_object(xml_path, chain, store_id, True, promos_category)
    return [
        null_item
        for promo_tag in bs_promos.find_all(chain.promotion_tag_name)
        for null_item in chain.get_null_items(promo_tag, items_dict)
    ]