Added a CSV-format promotions file that is written when running --promos. item.py was added for modularity.

This commit is contained in:
KorenLazar
2021-01-28 16:25:38 +02:00
parent 47c0d04ce4
commit d7e5b709f8
8 changed files with 94 additions and 29 deletions

View File

@@ -1,5 +1,5 @@
# Supermarket basic scraping # Supermarket basic scraping
The library supports scraping from Shufersal, Co-Op and Zol Vebegadol The library supports scraping from Shufersal, Co-Op and Zol Vebegadol.
## Installation ## Installation
clone: clone:
@@ -24,15 +24,14 @@ python main.py --find_store ירושלים --chain Shufersal
In case you want a different supermarket chain, just change 'Shufersal' to a different name (the options will be In case you want a different supermarket chain, just change 'Shufersal' to a different name (the options will be
printed in case of misspelling). printed in case of misspelling).
After running the command, you'll be able to see the different stores in Jerusalem with their IDs in "results\Shufersal-Stores.xml". The output of the last command - the different Shufersal stores in Jerusalem with their IDs - should be printed.
Now, that we have the store's ID, we can get the store's relevant promotions sorted by their start date, last update Now, that we have the store's ID, we can get the store's relevant promotions sorted by their start date, last update and length.
, and length.
```cmd script ```cmd script
python main.py --promos 5 --chain Shufersal python main.py --promos 5 --chain Shufersal
``` ```
* We assumed that the store's ID is 5. * We assumed that the store's ID is 5.
Now, you can find the promos in "results\Shufersal_promos_5.log". Now, you can find the promos in both "results\Shufersal_promos_5.csv" and "results\Shufersal_promos_5.log".
For other documentation and commands, you can run For other documentation and commands, you can run
```cmd script ```cmd script

View File

@@ -2,6 +2,8 @@ from typing import Dict, List
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from item import Item
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
@@ -31,6 +33,6 @@ class CoOp(SupermarketChain):
return 'Co-Op' return 'Co-Op'
@staticmethod @staticmethod
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
promo_item = items_dict.get(promo.find('ItemCode').text) promo_item = items_dict.get(promo.find('ItemCode').text)
return [promo_item] if promo_item else [] return [promo_item] if promo_item else []

13
item.py Normal file
View File

@@ -0,0 +1,13 @@
class Item:
    """
    A product sold in some supermarket.

    The constructor stores the given values as-is: no conversion or
    validation is applied, so each attribute keeps whatever type the
    caller passed in.
    """

    def __init__(self, name: str, price: float, manufacturer: str, code: int):
        """
        :param name: The product's name
        :param price: The product's price
        :param manufacturer: The name of the product's manufacturer
        :param code: The product's item code
        """
        self.code: int = code
        self.manufacturer: str = manufacturer
        self.price: float = price
        self.name: str = name

    def __repr__(self):
        # Rendered exactly like the repr of a (name, price, manufacturer, code) tuple.
        return f'({self.name!r}, {self.price!r}, {self.manufacturer!r}, {self.code!r})'

View File

@@ -1,6 +1,8 @@
from datetime import datetime from datetime import datetime
from typing import Dict, List from typing import Dict, List
import csv
from item import Item
from utils import ( from utils import (
create_items_dict, create_items_dict,
xml_file_gen, xml_file_gen,
@@ -17,12 +19,13 @@ class Promotion:
It contains only part of the available information in Shufersal's data. It contains only part of the available information in Shufersal's data.
""" """
def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime, items: List[str]): def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime,
items: List[Item]):
self.content: str = content self.content: str = content
self.start_date = start_date self.start_date: datetime = start_date
self.end_date: datetime = end_date self.end_date: datetime = end_date
self.update_date: datetime = update_date self.update_date: datetime = update_date
self.items: List[str] = items self.items: List[Item] = items
def __repr__(self): def __repr__(self):
title = self.content title = self.content
@@ -42,6 +45,40 @@ class Promotion:
return self.content == other.content and self.start_date == other.start_date and self.end_date == other.end_date return self.content == other.content and self.start_date == other.start_date and self.end_date == other.end_date
def write_promotions_to_csv(promotions: 'List[Promotion]', output_filename: str) -> None:
    """
    This function writes a given list of promotions to a given output file in a CSV format.

    The file starts with a header row, followed by one row per (promotion, item) pair:
    a promotion with several items yields several rows, and a promotion with no items
    yields no rows at all.

    :param promotions: A given list of promotions
    :param output_filename: A given file to write to (overwritten if it already exists)
    """
    # The encoding is explicit because the header (and typically the data) is Hebrew text:
    # relying on the platform default makes the output machine-dependent and can raise
    # UnicodeEncodeError on non-UTF-8 locales.
    # newline='' lets the csv module control line endings itself.
    with open(output_filename, mode='w', newline='', encoding='utf-8') as f_out:
        promos_writer = csv.writer(f_out)
        promos_writer.writerow([
            'תיאור המבצע',
            'הפריט המשתתף במבצע',
            'מחיר לפני המבצע',
            'זמן תחילת המבצע',
            'זמן סיום המבצע',
            'זמן עדכון אחרון',
            'יצרן',
            'ברקוד של הפריט'
        ])
        # Stream the rows in promotion order, keeping each promotion's items in their own order.
        promos_writer.writerows(
            [promo.content,
             item.name,
             item.price,
             promo.start_date,
             promo.end_date,
             promo.update_date,
             item.manufacturer,
             item.code]
            for promo in promotions
            for item in promo.items
        )
def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]: def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]:
""" """
This function return the available promotions given a BeautifulSoup object. This function return the available promotions given a BeautifulSoup object.
@@ -52,7 +89,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
:param load_prices: A boolean representing whether to load an existing xml or load an already saved one :param load_prices: A boolean representing whether to load an existing xml or load an already saved one
:return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available :return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available
""" """
items_dict: Dict[str, str] = create_items_dict(chain, load_prices, store_id) items_dict: Dict[str, Item] = create_items_dict(chain, load_prices, store_id)
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, chain.XMLFilesCategory.PromosFull) bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, chain.XMLFilesCategory.PromosFull)
@@ -78,10 +115,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
def is_valid_promo(promo: Promotion): def is_valid_promo(promo: Promotion):
""" """
This function checks if a given promo object is valid. This function returns whether a given Promotion object is currently valid.
:param promo: A given promotion
:return: True iff the given Promotion is valid.
""" """
today_date: datetime = datetime.now() today_date: datetime = datetime.now()
not_expired: bool = promo.end_date >= today_date not_expired: bool = promo.end_date >= today_date
@@ -93,7 +127,7 @@ def is_valid_promo(promo: Promotion):
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain): def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
""" """
This function logs the available promos in a store with a given id sorted by their update date. This function logs the available promotions in a store with a given id sorted by their update date.
:param chain: The name of the requested supermarket chain :param chain: The name of the requested supermarket chain
:param store_id: A given store id :param store_id: A given store id
@@ -105,6 +139,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket
promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date - promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
promo.end_date), reverse=True) promo.end_date), reverse=True)
logger.info('\n'.join(str(promotion) for promotion in promotions)) logger.info('\n'.join(str(promotion) for promotion in promotions))
write_promotions_to_csv(promotions, f'results/{chain}_promos_{store_id}.csv')
def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool): def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
@@ -124,7 +159,10 @@ def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str,
def get_all_null_items_in_promos(chain, store_id): def get_all_null_items_in_promos(chain, store_id):
items_dict: Dict[str, str] = create_items_dict(chain, True, store_id) """
This function finds all items appearing in the chain's promotions file but not in the chain's prices file.
"""
items_dict: Dict[str, Item] = create_items_dict(chain, True, store_id)
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
bs_promos = create_bs_object(xml_path, chain, store_id, True, chain.XMLFilesCategory.PromosFull) bs_promos = create_bs_object(xml_path, chain, store_id, True, chain.XMLFilesCategory.PromosFull)

View File

@@ -2,6 +2,8 @@ from typing import Dict, List
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from item import Item
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
@@ -30,7 +32,7 @@ class ShuferSal(SupermarketChain):
return 'Shufersal' return 'Shufersal'
@staticmethod @staticmethod
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
items = list() items = list()
for item in promo.find_all('Item'): for item in promo.find_all('Item'):
item_code = item.find('ItemCode').text item_code = item.find('ItemCode').text

View File

@@ -4,6 +4,8 @@ from argparse import ArgumentTypeError
from typing import Dict, List from typing import Dict, List
from bs4.element import Tag from bs4.element import Tag
from item import Item
class SupermarketChain: class SupermarketChain:
""" """
@@ -72,17 +74,20 @@ class SupermarketChain:
@staticmethod @staticmethod
@abstractmethod @abstractmethod
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
""" """
This method returns a list of the items that participate in a given promo This method returns a list of the items that participate in a given promotion.
:param promo: A given promo :param promo: A given promotion
:param items_dict: A given dictionary of products :param items_dict: A given dictionary of products
""" """
pass pass
@staticmethod @staticmethod
def get_null_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: def get_null_items(promo: Tag, items_dict: Dict[str, Item]) -> List[str]:
"""
This function returns all the items in a given promotion which do not appear in the given items_dict.
"""
return [item.find('ItemCode').text for item in promo.find_all('Item') return [item.find('ItemCode').text for item in promo.find_all('Item')
if not items_dict.get(item.find('ItemCode').text)] if not items_dict.get(item.find('ItemCode').text)]

View File

@@ -1,11 +1,13 @@
import gzip import gzip
import io import io
import zipfile import zipfile
from typing import AnyStr, Dict, List from typing import AnyStr, Dict
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from bs4.element import Tag from bs4.element import Tag
from os import path from os import path
from item import Item
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
import re import re
@@ -83,26 +85,29 @@ def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
return BeautifulSoup(f_in, features='xml') return BeautifulSoup(f_in, features='xml')
def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, str]: def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, Item]:
""" """
This function creates a dictionary where every key is an item code and its value is the item's name and price. This function creates a dictionary where every key is an item code and its value is its corresponding Item instance.
:param chain: A given supermarket chain :param chain: A given supermarket chain
:param load_xml: A boolean representing whether to load an existing prices xml file :param load_xml: A boolean representing whether to load an existing prices xml file
:param store_id: A given store id :param store_id: A given store id
:return: A dictionary where the firs
""" """
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name) xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull) bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)} return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
def get_item_info(item: Tag) -> List[str]: def get_item_info(item: Tag) -> Item:
""" """
This function returns a string containing important information about a given supermarket's product. This function returns a string containing important information about a given supermarket's product.
""" """
return [item.find(re.compile(r'ItemN[a]?m[e]?')).text, item.find(re.compile(r'Manufacture[r]?Name')).text, return Item(
item.find('ItemPrice').text, item.find('ItemCode').text] name=item.find(re.compile(r'ItemN[a]?m[e]?')).text,
price=item.find('ItemPrice').text,
manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text,
code=item.find('ItemCode').text
)
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None: def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:

View File

@@ -3,6 +3,7 @@ from typing import Dict, List
import requests import requests
from bs4.element import Tag from bs4.element import Tag
from item import Item
from supermarket_chain import SupermarketChain from supermarket_chain import SupermarketChain
@@ -32,7 +33,7 @@ class ZolVebegadol(SupermarketChain):
return down_url return down_url
@staticmethod @staticmethod
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]: def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
items = list() items = list()
for item in promo.find_all('Item'): for item in promo.find_all('Item'):
item_code = item.find('ItemCode').text item_code = item.find('ItemCode').text