Added RamiLevi to the chains collection
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
# Supermarket basic scraping
|
||||
The library supports scraping from Shufersal, Co-Op and Zol Vebegadol.
|
||||
The library supports scraping from Shufersal, CoOp, Zol Vebegadol and RamiLevi.
|
||||
|
||||
## Installation
|
||||
clone:
|
||||
|
9
co_op.py
9
co_op.py
@@ -8,14 +8,19 @@ from supermarket_chain import SupermarketChain
|
||||
|
||||
|
||||
class CoOp(SupermarketChain):
|
||||
|
||||
promotion_tag_name = 'Sale'
|
||||
promotion_update_tag_name = 'PriceUpdateDate'
|
||||
date_format = '%Y/%m/%d'
|
||||
date_hour_format = '%Y/%m/%d %H:%M:%S'
|
||||
item_tag_name = 'Product'
|
||||
|
||||
@property
|
||||
def update_date_format(self):
|
||||
return CoOp.date_hour_format
|
||||
|
||||
@staticmethod
|
||||
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
|
||||
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
|
||||
prefix = "http://matrixcatalog.co.il/"
|
||||
url = prefix + "NBCompetitionRegulations.aspx"
|
||||
req_res: requests.Response = requests.get(url)
|
||||
@@ -30,7 +35,7 @@ class CoOp(SupermarketChain):
|
||||
All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)
|
||||
|
||||
def __repr__(self):
|
||||
return 'Co-Op'
|
||||
return 'CoOp'
|
||||
|
||||
@staticmethod
|
||||
def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
|
||||
|
9
main.py
9
main.py
@@ -7,6 +7,7 @@ from supermarket_chain import SupermarketChain
|
||||
from shufersal import ShuferSal
|
||||
from co_op import CoOp
|
||||
from zol_vebegadol import ZolVebegadol
|
||||
from rami_levi import RamiLevi
|
||||
from pathlib import Path
|
||||
|
||||
# TODO: fix problem of left-to-right printing
|
||||
@@ -16,8 +17,9 @@ Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
|
||||
|
||||
chain_dict = {
|
||||
'Shufersal': ShuferSal(),
|
||||
'Co-Op': CoOp(),
|
||||
'Zol-Vebegadol': ZolVebegadol()
|
||||
'CoOp': CoOp(),
|
||||
'Zol-Vebegadol': ZolVebegadol(),
|
||||
'RamiLevi': RamiLevi(),
|
||||
}
|
||||
|
||||
if __name__ == '__main__':
|
||||
@@ -75,7 +77,8 @@ if __name__ == '__main__':
|
||||
handler = logging.FileHandler(filename=f'{RESULTS_DIRNAME}/{args.chain}_promos_{arg_store_id}.log', mode='w',
|
||||
encoding='utf-8')
|
||||
logger.addHandler(handler)
|
||||
main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain)
|
||||
main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain,
|
||||
load_promos=args.load_promos)
|
||||
|
||||
elif args.price:
|
||||
get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1])
|
||||
|
@@ -10,7 +10,7 @@ from utils import (
|
||||
)
|
||||
from supermarket_chain import SupermarketChain
|
||||
|
||||
PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל']
|
||||
PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'מגבת', 'מפות', 'פסטיגל', 'ביגי']
|
||||
|
||||
|
||||
class Promotion:
|
||||
@@ -102,7 +102,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
|
||||
chain.date_hour_format),
|
||||
end_date=datetime.strptime(promo.find(
|
||||
'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format),
|
||||
update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format),
|
||||
update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.update_date_format),
|
||||
items=chain.get_items(promo, items_dict),
|
||||
)
|
||||
if is_valid_promo(promo):
|
||||
@@ -125,7 +125,7 @@ def is_valid_promo(promo: Promotion):
|
||||
return not_expired and has_started and has_products and not in_promo_ignore_list
|
||||
|
||||
|
||||
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
|
||||
def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain, load_promos: bool):
|
||||
"""
|
||||
This function logs the available promotions in a store with a given id sorted by their update date.
|
||||
|
||||
@@ -135,7 +135,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket
|
||||
:param logger: A given logger
|
||||
"""
|
||||
|
||||
promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False)
|
||||
promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
|
||||
promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
|
||||
promo.end_date), reverse=True)
|
||||
logger.info('\n'.join(str(promotion) for promotion in promotions))
|
||||
|
66
rami_levi.py
Normal file
66
rami_levi.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import json
|
||||
from typing import Dict, List
|
||||
import requests
|
||||
from bs4.element import Tag
|
||||
|
||||
from item import Item
|
||||
from supermarket_chain import SupermarketChain
|
||||
|
||||
|
||||
class RamiLevi(SupermarketChain):
    """
    Chain definition for Rami Levi: tag names, date formats and the logic for
    locating the downloadable data file on publishedprices.co.il.
    """

    @property
    def promotion_tag_name(self):
        return 'Promotion'

    @property
    def promotion_update_tag_name(self):
        return 'PromotionUpdateDate'

    @property
    def date_format(self):
        return '%Y-%m-%d'

    @property
    def date_hour_format(self):
        return '%Y-%m-%d %H:%M:%S'

    @property
    def update_date_format(self):
        # The update date carries no seconds, unlike date_hour_format.
        return '%Y-%m-%d %H:%M'

    @property
    def item_tag_name(self):
        return 'Item'

    @staticmethod
    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
        """
        Log in to the published-prices site and return the download url of the
        file matching the given store and category.

        :param store_id: A given id of a store (anything int() accepts)
        :param category: A given category; its name is used as the search term
        :param session: The session used for the login and the directory query; the caller is expected to reuse it
                        for the actual download
        :return: A downloadable link of the data for a given store and category
        :raises ValueError: If the directory listing holds no file for the given store and category
        """
        hostname = "https://publishedprices.co.il"

        # Post the payload to the site to log in
        session.post(hostname + "/login/user", data={'username': 'ramilevi'})

        # Scrape the data.
        # Dropping every 's' turns the category names into 'Promo', 'PromoFull', 'Price', 'PriceFull' and 'Store'
        # (presumably the prefixes the site uses for its file names - verify against the site).
        ajax_dir_payload = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
        listing = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
        listed_files = json.loads(listing.text)['aaData']

        # int() lets a store id that arrives as a string go through the zero-padded formatting.
        # NOTE(review): the trailing '-20' looks like the start of a 20xx timestamp in the file name - confirm.
        store_marker = f'-{int(store_id):03d}-20'
        suffix = next((entry['name'] for entry in listed_files if store_marker in entry['name']), None)
        if suffix is None:
            # A bare next() would leak StopIteration here, which says nothing about what went wrong.
            raise ValueError(f'No {category.name} file was found for store {store_id} in {hostname}')

        download_url = hostname + "/file/d/" + suffix
        print(download_url)
        return download_url

    @staticmethod
    def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
        """
        Return the full information of the promotion's items that are known to items_dict.

        :param promo: A promotion tag holding 'Item' tags
        :param items_dict: A mapping from an item code to its full information
        :return: The items of the promotion that have an entry in items_dict, in document order
        """
        looked_up = (items_dict.get(item.find('ItemCode').text) for item in promo.find_all('Item'))
        # Items whose code is missing from items_dict are left out.
        return [full_item_info for full_item_info in looked_up if full_item_info]

    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)

    def __repr__(self):
        return 'RamiLevi'
|
@@ -14,8 +14,12 @@ class ShuferSal(SupermarketChain):
|
||||
date_hour_format = '%Y-%m-%d %H:%M'
|
||||
item_tag_name = 'Item'
|
||||
|
||||
@property
|
||||
def update_date_format(self):
|
||||
return ShuferSal.date_hour_format
|
||||
|
||||
@staticmethod
|
||||
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
|
||||
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
|
||||
url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}"
|
||||
if SupermarketChain.is_valid_store_id(int(store_id)):
|
||||
url += f"&storeId={store_id}"
|
||||
|
@@ -1,7 +1,10 @@
|
||||
import re
|
||||
from abc import abstractmethod
|
||||
from enum import Enum
|
||||
from argparse import ArgumentTypeError
|
||||
from typing import Dict, List
|
||||
|
||||
import requests
|
||||
from bs4.element import Tag
|
||||
|
||||
from item import Item
|
||||
@@ -35,6 +38,10 @@ class SupermarketChain:
|
||||
@abstractmethod
|
||||
def date_hour_format(self): pass
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def update_date_format(self): pass
|
||||
|
||||
@property
|
||||
@abstractmethod
|
||||
def item_tag_name(self): pass
|
||||
@@ -62,10 +69,11 @@ class SupermarketChain:
|
||||
|
||||
@staticmethod
|
||||
@abstractmethod
|
||||
def get_download_url(store_id: int, category: XMLFilesCategory) -> str:
|
||||
def get_download_url(store_id: int, category: XMLFilesCategory, session: requests.Session) -> str:
|
||||
"""
|
||||
This method scrapes supermarket's website and returns a url containing the data for a given store and category.
|
||||
|
||||
:param session:
|
||||
:param store_id: A given id of a store
|
||||
:param category: A given category
|
||||
:return: A downloadable link of the data for a given store and category
|
||||
@@ -91,6 +99,18 @@ class SupermarketChain:
|
||||
return [item.find('ItemCode').text for item in promo.find_all('Item')
|
||||
if not items_dict.get(item.find('ItemCode').text)]
|
||||
|
||||
@staticmethod
def get_item_info(item: Tag) -> Item:
    """
    Build an Item out of a single product tag.

    The name and manufacturer tags are looked up by regular expressions, so spelling variants of the tag names
    (e.g. 'ItemName' / 'ItemNm', 'ManufacturerName' / 'ManufactureName') are accepted.

    :param item: A tag describing one product
    :return: An Item holding the product's name, price, manufacturer and code
    """
    name_pattern = re.compile(r'ItemN[a]?m[e]?')
    item_name = item.find(name_pattern).text
    item_price = item.find('ItemPrice').text
    manufacturer_pattern = re.compile(r'Manufacture[r]?Name')
    item_manufacturer = item.find(manufacturer_pattern).text
    item_code = item.find('ItemCode').text
    return Item(name=item_name, price=item_price, manufacturer=item_manufacturer, code=item_code)
|
||||
|
||||
@abstractmethod
|
||||
def __repr__(self):
|
||||
pass
|
||||
|
30
utils.py
30
utils.py
@@ -4,12 +4,10 @@ import zipfile
|
||||
from typing import AnyStr, Dict
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import Tag
|
||||
from os import path
|
||||
|
||||
from item import Item
|
||||
from supermarket_chain import SupermarketChain
|
||||
import re
|
||||
|
||||
RESULTS_DIRNAME = "results"
|
||||
RAW_FILES_DIRNAME = "raw_files"
|
||||
@@ -60,8 +58,9 @@ def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category:
|
||||
:param category: A given category
|
||||
:return: A BeautifulSoup object with xml content.
|
||||
"""
|
||||
download_url: str = chain.get_download_url(store_id, category)
|
||||
response_content = requests.get(download_url).content
|
||||
session = requests.Session()
|
||||
download_url: str = chain.get_download_url(store_id, category, session)
|
||||
response_content = session.get(download_url).content
|
||||
try:
|
||||
xml_content: AnyStr = gzip.decompress(response_content)
|
||||
except gzip.BadGzipFile:
|
||||
@@ -95,19 +94,7 @@ def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[
|
||||
"""
|
||||
xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
|
||||
bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
|
||||
return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
|
||||
|
||||
|
||||
def get_item_info(item: Tag) -> Item:
    """
    Collect the important fields of a given supermarket product into an Item.

    :param item: A tag describing one product
    :return: An Item with the name, price, manufacturer and code read from the tag
    """
    # Tag names are matched by pattern so that spelling variants of them are accepted.
    name_tag = item.find(re.compile(r'ItemN[a]?m[e]?'))
    product_name = name_tag.text
    product_price = item.find('ItemPrice').text
    manufacturer_tag = item.find(re.compile(r'Manufacture[r]?Name'))
    product_manufacturer = manufacturer_tag.text
    product_code = item.find('ItemCode').text
    return Item(
        name=product_name,
        price=product_price,
        manufacturer=product_manufacturer,
        code=product_code,
    )
|
||||
return {item.find('ItemCode').text: chain.get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
|
||||
|
||||
|
||||
def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
|
||||
@@ -124,5 +111,10 @@ def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool,
|
||||
prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
|
||||
prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
|
||||
for prod in prods:
|
||||
print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1],
|
||||
prod.find('ItemPrice').text))
|
||||
print(
|
||||
(
|
||||
prod.find('ItemName').text[::-1],
|
||||
prod.find('ManufacturerName').text[::-1],
|
||||
prod.find('ItemPrice').text
|
||||
)
|
||||
)
|
||||
|
@@ -20,8 +20,12 @@ class ZolVebegadol(SupermarketChain):
|
||||
date_hour_format = '%Y-%m-%d %H:%M:%S'
|
||||
item_tag_name = 'Item'
|
||||
|
||||
@property
|
||||
def update_date_format(self):
|
||||
return ZolVebegadol.date_hour_format
|
||||
|
||||
@staticmethod
|
||||
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
|
||||
def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session:requests.Session) -> str:
|
||||
prefix = "http://zolvebegadol.binaprojects.com"
|
||||
url = prefix + "/MainIO_Hok.aspx"
|
||||
req_res: requests.Response = requests.get(url)
|
||||
|
Reference in New Issue
Block a user