Added a CSV-format promotions file that is written when running --promos. item.py was added for better modularity.
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
# Supermarket basic scraping
|
# Supermarket basic scraping
|
||||||
The library supports scraping from Shufersal, Co-Op and Zol Vebegadol
|
The library supports scraping from Shufersal, Co-Op and Zol Vebegadol.
|
||||||
|
|
||||||
## Installation
|
## Installation
|
||||||
clone:
|
clone:
|
||||||
@@ -24,15 +24,14 @@ python main.py --find_store ירושלים --chain Shufersal
|
|||||||
In case you want a different supermarket chain, just change 'Shufersal' to a different name (the options will be
|
In case you want a different supermarket chain, just change 'Shufersal' to a different name (the options will be
|
||||||
printed in case of misspelling).
|
printed in case of misspelling).
|
||||||
|
|
||||||
After running the command, you'll be able to see the different stores in Jerusalem with their IDs in "results\Shufersal-Stores.xml".
|
The output of the last command - the different Shufersal stores in Jerusalem with their IDs - should be printed.
|
||||||
|
|
||||||
Now, that we have the store's ID, we can get the store's relevant promotions sorted by their start date, last update
|
Now that we have the store's ID, we can get the store's relevant promotions sorted by their start date, last update, and length.
|
||||||
, and length.
|
|
||||||
```cmd script
|
```cmd script
|
||||||
python main.py --promos 5 --chain Shufersal
|
python main.py --promos 5 --chain Shufersal
|
||||||
```
|
```
|
||||||
* We assumed that the store's ID is 5.
|
* We assumed that the store's ID is 5.
|
||||||
Now, you can find the promos in "results\Shufersal_promos_5.log".
|
Now, you can find the promos in both "results\Shufersal_promos_5.csv" and "results\Shufersal_promos_5.log".
|
||||||
|
|
||||||
For other documentation and commands, you can run
|
For other documentation and commands, you can run
|
||||||
```cmd script
|
```cmd script
|
||||||
|
4
co_op.py
4
co_op.py
@@ -2,6 +2,8 @@ from typing import Dict, List
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
from item import Item
|
||||||
from supermarket_chain import SupermarketChain
|
from supermarket_chain import SupermarketChain
|
||||||
|
|
||||||
|
|
||||||
@@ -31,6 +33,6 @@ class CoOp(SupermarketChain):
|
|||||||
return 'Co-Op'
|
return 'Co-Op'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
|
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
|
||||||
promo_item = items_dict.get(promo.find('ItemCode').text)
|
promo_item = items_dict.get(promo.find('ItemCode').text)
|
||||||
return [promo_item] if promo_item else []
|
return [promo_item] if promo_item else []
|
||||||
|
13
item.py
Normal file
13
item.py
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
class Item:
    """
    A class representing a product in some supermarket.

    The constructor stores every value exactly as it is given; nothing is converted or validated.
    """

    def __init__(self, name: str, price: str, manufacturer: str, code: str):
        """
        :param name: The product's name
        :param price: The product's price, as the raw text taken from the prices file
        :param manufacturer: The name of the product's manufacturer
        :param code: The product's item code (barcode), as the raw text taken from the prices file
        """
        self.name: str = name
        # The price and the code are built from XML text nodes, so they are annotated as strings.
        # Keeping the code as text also preserves any leading zeros it may contain.
        self.price: str = price
        self.manufacturer: str = manufacturer
        self.code: str = code

    def __repr__(self) -> str:
        """
        :return: The textual form of the tuple (name, price, manufacturer, code)
        """
        return str((self.name, self.price, self.manufacturer, self.code))
|
58
promotion.py
58
promotion.py
@@ -1,6 +1,8 @@
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
import csv
|
||||||
|
|
||||||
|
from item import Item
|
||||||
from utils import (
|
from utils import (
|
||||||
create_items_dict,
|
create_items_dict,
|
||||||
xml_file_gen,
|
xml_file_gen,
|
||||||
@@ -17,12 +19,13 @@ class Promotion:
|
|||||||
It contains only part of the available information in Shufersal's data.
|
It contains only part of the available information in Shufersal's data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime, items: List[str]):
|
def __init__(self, content: str, start_date: datetime, end_date: datetime, update_date: datetime,
|
||||||
|
items: List[Item]):
|
||||||
self.content: str = content
|
self.content: str = content
|
||||||
self.start_date = start_date
|
self.start_date: datetime = start_date
|
||||||
self.end_date: datetime = end_date
|
self.end_date: datetime = end_date
|
||||||
self.update_date: datetime = update_date
|
self.update_date: datetime = update_date
|
||||||
self.items: List[str] = items
|
self.items: List[Item] = items
|
||||||
|
|
||||||
def __repr__(self):
|
def __repr__(self):
|
||||||
title = self.content
|
title = self.content
|
||||||
@@ -42,6 +45,40 @@ class Promotion:
|
|||||||
return self.content == other.content and self.start_date == other.start_date and self.end_date == other.end_date
|
return self.content == other.content and self.start_date == other.start_date and self.end_date == other.end_date
|
||||||
|
|
||||||
|
|
||||||
|
def write_promotions_to_csv(promotions: List[Promotion], output_filename: str) -> None:
    """
    This function writes a given list of promotions to a given output file in a CSV format.

    The file starts with a header row, followed by one row for every (promotion, item) pair, so a promotion
    with several participating items spans several rows and a promotion without items produces no row.

    :param promotions: A given list of promotions
    :param output_filename: A given file to write to (overwritten if it already exists)
    """
    header = [
        'תיאור המבצע',
        'הפריט המשתתף במבצע',
        'מחיר לפני המבצע',
        'זמן תחילת המבצע',
        'זמן סיום המבצע',
        'זמן עדכון אחרון',
        'יצרן',
        'ברקוד של הפריט',
    ]

    # The encoding is stated explicitly because the header is in Hebrew: with the platform's default encoding
    # the write can fail (or produce unreadable text) on systems whose locale is not UTF-8.
    # 'utf-8-sig' prepends a BOM, which lets spreadsheet applications detect that the file is UTF-8.
    # newline='' is what the csv module requires for files it writes to.
    with open(output_filename, mode='w', newline='', encoding='utf-8-sig') as f_out:
        promos_writer = csv.writer(f_out)
        promos_writer.writerow(header)

        for promo in promotions:
            promos_writer.writerows(
                [promo.content,
                 item.name,
                 item.price,
                 promo.start_date,
                 promo.end_date,
                 promo.update_date,
                 item.manufacturer,
                 item.code]
                for item in promo.items
            )
|
||||||
|
|
||||||
|
|
||||||
def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]:
|
def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bool, load_promos) -> List[Promotion]:
|
||||||
"""
|
"""
|
||||||
This function returns the available promotions of a given store in a given supermarket chain.
|
This function returns the available promotions of a given store in a given supermarket chain.
|
||||||
@@ -52,7 +89,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
|
|||||||
:param load_prices: A boolean representing whether to load an existing xml or load an already saved one
|
:param load_prices: A boolean representing whether to load an existing xml or load an already saved one
|
||||||
:return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available
|
:return: Promotions that are not included in PRODUCTS_TO_IGNORE and are currently available
|
||||||
"""
|
"""
|
||||||
items_dict: Dict[str, str] = create_items_dict(chain, load_prices, store_id)
|
items_dict: Dict[str, Item] = create_items_dict(chain, load_prices, store_id)
|
||||||
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
|
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
|
||||||
bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, chain.XMLFilesCategory.PromosFull)
|
bs_promos = create_bs_object(xml_path, chain, store_id, load_promos, chain.XMLFilesCategory.PromosFull)
|
||||||
|
|
||||||
@@ -78,10 +115,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
|
|||||||
|
|
||||||
def is_valid_promo(promo: Promotion):
|
def is_valid_promo(promo: Promotion):
|
||||||
"""
|
"""
|
||||||
This function checks if a given promo object is valid.
|
This function returns whether a given Promotion object is currently valid.
|
||||||
|
|
||||||
:param promo: A given promotion
|
|
||||||
:return: True iff the given Promotion is valid.
|
|
||||||
"""
|
"""
|
||||||
today_date: datetime = datetime.now()
|
today_date: datetime = datetime.now()
|
||||||
not_expired: bool = promo.end_date >= today_date
|
not_expired: bool = promo.end_date >= today_date
|
||||||
@@ -93,7 +127,7 @@ def is_valid_promo(promo: Promotion):
|
|||||||
|
|
||||||
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
|
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
|
||||||
"""
|
"""
|
||||||
This function logs the available promos in a store with a given id sorted by their update date.
|
This function logs the available promotions in a store with a given id sorted by their update date, and also writes them to a CSV file under the results directory.
|
||||||
|
|
||||||
:param chain: The name of the requested supermarket chain
|
:param chain: The name of the requested supermarket chain
|
||||||
:param store_id: A given store id
|
:param store_id: A given store id
|
||||||
@@ -105,6 +139,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket
|
|||||||
promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
|
promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
|
||||||
promo.end_date), reverse=True)
|
promo.end_date), reverse=True)
|
||||||
logger.info('\n'.join(str(promotion) for promotion in promotions))
|
logger.info('\n'.join(str(promotion) for promotion in promotions))
|
||||||
|
write_promotions_to_csv(promotions, f'results/{chain}_promos_{store_id}.csv')
|
||||||
|
|
||||||
|
|
||||||
def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
|
def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str, load_prices: bool, load_promos: bool):
|
||||||
@@ -124,7 +159,10 @@ def get_promos_by_name(store_id: int, chain: SupermarketChain, promo_name: str,
|
|||||||
|
|
||||||
|
|
||||||
def get_all_null_items_in_promos(chain, store_id):
|
def get_all_null_items_in_promos(chain, store_id):
|
||||||
items_dict: Dict[str, str] = create_items_dict(chain, True, store_id)
|
"""
|
||||||
|
This function finds all items appearing in the chain's promotions file but not in the chain's prices file.
|
||||||
|
"""
|
||||||
|
items_dict: Dict[str, Item] = create_items_dict(chain, True, store_id)
|
||||||
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
|
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PromosFull.name)
|
||||||
bs_promos = create_bs_object(xml_path, chain, store_id, True, chain.XMLFilesCategory.PromosFull)
|
bs_promos = create_bs_object(xml_path, chain, store_id, True, chain.XMLFilesCategory.PromosFull)
|
||||||
|
|
||||||
|
@@ -2,6 +2,8 @@ from typing import Dict, List
|
|||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
from item import Item
|
||||||
from supermarket_chain import SupermarketChain
|
from supermarket_chain import SupermarketChain
|
||||||
|
|
||||||
|
|
||||||
@@ -30,7 +32,7 @@ class ShuferSal(SupermarketChain):
|
|||||||
return 'Shufersal'
|
return 'Shufersal'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
|
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
|
||||||
items = list()
|
items = list()
|
||||||
for item in promo.find_all('Item'):
|
for item in promo.find_all('Item'):
|
||||||
item_code = item.find('ItemCode').text
|
item_code = item.find('ItemCode').text
|
||||||
|
@@ -4,6 +4,8 @@ from argparse import ArgumentTypeError
|
|||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
from item import Item
|
||||||
|
|
||||||
|
|
||||||
class SupermarketChain:
|
class SupermarketChain:
|
||||||
"""
|
"""
|
||||||
@@ -72,17 +74,20 @@ class SupermarketChain:
|
|||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@abstractmethod
|
@abstractmethod
|
||||||
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
|
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
|
||||||
"""
|
"""
|
||||||
This method returns a list of the items that participate in a given promo
|
This method returns a list of the items that participate in a given promotion.
|
||||||
|
|
||||||
:param promo: A given promo
|
:param promo: A given promotion
|
||||||
:param items_dict: A given dictionary of products
|
:param items_dict: A given dictionary of products
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_null_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
|
def get_null_items(promo: Tag, items_dict: Dict[str, Item]) -> List[str]:
|
||||||
|
"""
|
||||||
|
This function returns the item codes of all the items in a given promotion which do not appear in the given items_dict.
|
||||||
|
"""
|
||||||
return [item.find('ItemCode').text for item in promo.find_all('Item')
|
return [item.find('ItemCode').text for item in promo.find_all('Item')
|
||||||
if not items_dict.get(item.find('ItemCode').text)]
|
if not items_dict.get(item.find('ItemCode').text)]
|
||||||
|
|
||||||
|
19
utils.py
19
utils.py
@@ -1,11 +1,13 @@
|
|||||||
import gzip
|
import gzip
|
||||||
import io
|
import io
|
||||||
import zipfile
|
import zipfile
|
||||||
from typing import AnyStr, Dict, List
|
from typing import AnyStr, Dict
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
from os import path
|
from os import path
|
||||||
|
|
||||||
|
from item import Item
|
||||||
from supermarket_chain import SupermarketChain
|
from supermarket_chain import SupermarketChain
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@@ -83,26 +85,29 @@ def create_bs_object_from_xml(xml_path: str) -> BeautifulSoup:
|
|||||||
return BeautifulSoup(f_in, features='xml')
|
return BeautifulSoup(f_in, features='xml')
|
||||||
|
|
||||||
|
|
||||||
def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, str]:
|
def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[str, Item]:
|
||||||
"""
|
"""
|
||||||
This function creates a dictionary where every key is an item code and its value is the item's name and price.
|
This function creates a dictionary where every key is an item code and its value is its corresponding Item instance.
|
||||||
|
|
||||||
:param chain: A given supermarket chain
|
:param chain: A given supermarket chain
|
||||||
:param load_xml: A boolean representing whether to load an existing prices xml file
|
:param load_xml: A boolean representing whether to load an existing prices xml file
|
||||||
:param store_id: A given store id
|
:param store_id: A given store id
|
||||||
:return: A dictionary where the firs
|
|
||||||
"""
|
"""
|
||||||
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
|
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
|
||||||
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
|
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
|
||||||
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
|
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
|
||||||
|
|
||||||
|
|
||||||
def get_item_info(item: Tag) -> List[str]:
|
def get_item_info(item: Tag) -> Item:
|
||||||
"""
|
"""
|
||||||
This function returns a string containing important information about a given supermarket's product.
|
This function returns an Item instance containing important information about a given supermarket's product.
|
||||||
"""
|
"""
|
||||||
return [item.find(re.compile(r'ItemN[a]?m[e]?')).text, item.find(re.compile(r'Manufacture[r]?Name')).text,
|
return Item(
|
||||||
item.find('ItemPrice').text, item.find('ItemCode').text]
|
name=item.find(re.compile(r'ItemN[a]?m[e]?')).text,
|
||||||
|
price=item.find('ItemPrice').text,
|
||||||
|
manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text,
|
||||||
|
code=item.find('ItemCode').text
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
|
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
|
||||||
|
@@ -3,6 +3,7 @@ from typing import Dict, List
|
|||||||
import requests
|
import requests
|
||||||
from bs4.element import Tag
|
from bs4.element import Tag
|
||||||
|
|
||||||
|
from item import Item
|
||||||
from supermarket_chain import SupermarketChain
|
from supermarket_chain import SupermarketChain
|
||||||
|
|
||||||
|
|
||||||
@@ -32,7 +33,7 @@ class ZolVebegadol(SupermarketChain):
|
|||||||
return down_url
|
return down_url
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_items(promo: Tag, items_dict: Dict[str, str]) -> List[str]:
|
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
|
||||||
items = list()
|
items = list()
|
||||||
for item in promo.find_all('Item'):
|
for item in promo.find_all('Item'):
|
||||||
item_code = item.find('ItemCode').text
|
item_code = item.find('ItemCode').text
|
||||||
|
Reference in New Issue
Block a user