Added RamiLevi to the chains collection
README.md
@@ -1,5 +1,5 @@
 # Supermarket basic scraping
-The library supports scraping from Shufersal, Co-Op and Zol Vebegadol.
+The library supports scraping from Shufersal, CoOp and Zol Vebegadol.
 
 ## Installation
 clone:

co_op.py
@@ -8,20 +8,25 @@ from supermarket_chain import SupermarketChain
 
 
 class CoOp(SupermarketChain):
 
     promotion_tag_name = 'Sale'
     promotion_update_tag_name = 'PriceUpdateDate'
     date_format = '%Y/%m/%d'
     date_hour_format = '%Y/%m/%d %H:%M:%S'
     item_tag_name = 'Product'
 
+    @property
+    def update_date_format(self):
+        return CoOp.date_hour_format
+
     @staticmethod
-    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
         prefix = "http://matrixcatalog.co.il/"
         url = prefix + "NBCompetitionRegulations.aspx"
         req_res: requests.Response = requests.get(url)
         soup = BeautifulSoup(req_res.text, features='lxml')
         suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
                                 and f'-{store_id:03d}-20' in value).attrs['href']
         down_url = prefix + suffix
         print(down_url)
         return down_url
@@ -30,7 +35,7 @@ class CoOp(SupermarketChain):
         All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)
 
     def __repr__(self):
-        return 'Co-Op'
+        return 'CoOp'
 
     @staticmethod
     def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
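The href filter above relies on BeautifulSoup's callable-attribute matching. A minimal, self-contained sketch of the same pattern; the file names below are illustrative stand-ins for matrixcatalog.co.il's real listing:

```python
# Sketch of the href-filter pattern used in CoOp.get_download_url.
from bs4 import BeautifulSoup

html = """
<a href="Price7290058140886-001-202101010600.gz">Price</a>
<a href="PriceFull7290058140886-001-202101010600.gz">PriceFull</a>
"""
soup = BeautifulSoup(html, features='lxml')

store_id = 1
category_name = 'PricesFull'             # category.name in the real code
wanted = category_name.replace('s', '')  # -> 'PriceFull'

# Keep only <a> tags whose href contains the category and the store id.
link = soup.find('a', href=lambda value: value and wanted in value
                 and f'-{store_id:03d}-20' in value)
print(link.attrs['href'])  # PriceFull7290058140886-001-202101010600.gz
```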

main.py
@@ -7,6 +7,7 @@ from supermarket_chain import SupermarketChain
 from shufersal import ShuferSal
 from co_op import CoOp
 from zol_vebegadol import ZolVebegadol
+from rami_levi import RamiLevi
 from pathlib import Path
 
 # TODO: fix problem of left-to-right printing
@@ -16,8 +17,9 @@ Path(RAW_FILES_DIRNAME).mkdir(exist_ok=True)
 
 chain_dict = {
     'Shufersal': ShuferSal(),
-    'Co-Op': CoOp(),
-    'Zol-Vebegadol': ZolVebegadol()
+    'CoOp': CoOp(),
+    'Zol-Vebegadol': ZolVebegadol(),
+    'RamiLevi': RamiLevi(),
 }
 
 if __name__ == '__main__':
@@ -75,7 +77,8 @@ if __name__ == '__main__':
         handler = logging.FileHandler(filename=f'{RESULTS_DIRNAME}/{args.chain}_promos_{arg_store_id}.log', mode='w',
                                       encoding='utf-8')
         logger.addHandler(handler)
-        main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain)
+        main_latest_promos(store_id=arg_store_id, load_xml=args.load_prices, logger=logger, chain=chain,
+                           load_promos=args.load_promos)
 
     elif args.price:
         get_products_prices(chain, store_id=args.price[0], load_xml=args.load_prices, product_name=args.price[1])
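For context, the chain_dict mapping above is what lets the command line pick a chain by name. A hedged sketch of that lookup; the argparse wiring and the stand-in values are illustrative, not the repository's exact code:

```python
# Illustrative only: strings stand in for the real chain objects.
import argparse

chain_dict = {'Shufersal': 'ShuferSal()', 'CoOp': 'CoOp()',
              'Zol-Vebegadol': 'ZolVebegadol()', 'RamiLevi': 'RamiLevi()'}

parser = argparse.ArgumentParser()
parser.add_argument('--chain', choices=chain_dict.keys(), required=True)
args = parser.parse_args(['--chain', 'RamiLevi'])

chain = chain_dict[args.chain]  # with the real dict this is a RamiLevi instance
print(chain)
```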

@@ -10,7 +10,7 @@ from utils import (
 )
 from supermarket_chain import SupermarketChain
 
-PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'צלחות', 'כוסות', 'מאגים', 'מגבת', 'מפות', 'פסטיגל']
+PRODUCTS_TO_IGNORE = ['סירים', 'מגבות', 'מגבת', 'מפות', 'פסטיגל', 'ביגי']
 
 
 class Promotion:
@@ -102,7 +102,7 @@ def get_available_promos(chain: SupermarketChain, store_id: int, load_prices: bo
                                      chain.date_hour_format),
             end_date=datetime.strptime(promo.find(
                 'PromotionEndDate').text + ' ' + promo.find('PromotionEndHour').text, chain.date_hour_format),
-            update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.date_hour_format),
+            update_date=datetime.strptime(promo.find(chain.promotion_update_tag_name).text, chain.update_date_format),
             items=chain.get_items(promo, items_dict),
         )
         if is_valid_promo(promo):
@@ -125,7 +125,7 @@ def is_valid_promo(promo: Promotion):
     return not_expired and has_started and has_products and not in_promo_ignore_list
 
 
-def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain):
+def main_latest_promos(store_id: int, load_xml: bool, logger, chain: SupermarketChain, load_promos: bool):
     """
     This function logs the available promotions in a store with a given id sorted by their update date.
 
@@ -135,7 +135,7 @@ def main_latest_promos(store_id: int, load_xml: bool, logger, chain: Supermarket
     :param logger: A given logger
     """
 
-    promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, False)
+    promotions: List[Promotion] = get_available_promos(chain, store_id, load_xml, load_promos)
     promotions.sort(key=lambda promo: (max(promo.update_date.date(), promo.start_date.date()), promo.start_date -
                                        promo.end_date), reverse=True)
     logger.info('\n'.join(str(promotion) for promotion in promotions))
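The switch from chain.date_hour_format to chain.update_date_format matters because the update timestamp's format can differ from the start/end timestamps'. With RamiLevi's formats from rami_levi.py below (the timestamp values here are made up):

```python
# RamiLevi's formats (from rami_levi.py); the date strings are illustrative.
from datetime import datetime

date_hour_format = '%Y-%m-%d %H:%M:%S'  # start/end dates carry seconds
update_date_format = '%Y-%m-%d %H:%M'   # PromotionUpdateDate does not

print(datetime.strptime('2021-01-03 08:00:00', date_hour_format))
print(datetime.strptime('2021-01-03 08:00', update_date_format))
# Parsing '2021-01-03 08:00' with date_hour_format would raise ValueError,
# which is why the promotion parsing now uses chain.update_date_format.
```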

rami_levi.py (new file)
@@ -0,0 +1,66 @@
+import json
+from typing import Dict, List
+import requests
+from bs4.element import Tag
+
+from item import Item
+from supermarket_chain import SupermarketChain
+
+
+class RamiLevi(SupermarketChain):
+    @property
+    def promotion_tag_name(self):
+        return 'Promotion'
+
+    @property
+    def promotion_update_tag_name(self):
+        return 'PromotionUpdateDate'
+
+    @property
+    def date_format(self):
+        return '%Y-%m-%d'
+
+    @property
+    def date_hour_format(self):
+        return '%Y-%m-%d %H:%M:%S'
+
+    @property
+    def update_date_format(self):
+        return '%Y-%m-%d %H:%M'
+
+    @property
+    def item_tag_name(self):
+        return 'Item'
+
+    @staticmethod
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
+        hostname = "https://publishedprices.co.il"
+
+        # Post the payload to the site to log in
+        session.post(hostname + "/login/user", data={'username': 'ramilevi'})
+
+        # Scrape the data
+        ajax_dir_payload = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
+        s = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
+        s_json = json.loads(s.text)
+        suffix = next(d['name'] for d in s_json['aaData'] if f'-{store_id:03d}-20' in d['name'])
+
+        download_url = hostname + "/file/d/" + suffix
+        print(download_url)
+        return download_url
+
+    @staticmethod
+    def get_items(promo: Tag, items_dict: Dict[str, Item]) -> List[Item]:
+        items = list()
+        for item in promo.find_all('Item'):
+            item_code = item.find('ItemCode').text
+            full_item_info = items_dict.get(item_code)
+            if full_item_info:
+                items.append(full_item_info)
+        return items
+
+    class XMLFilesCategory(SupermarketChain.XMLFilesCategory):
+        All, Promos, PromosFull, Prices, PricesFull, Stores = range(6)
+
+    def __repr__(self):
+        return 'RamiLevi'
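A hedged usage sketch of the new class: the same requests.Session must carry the login cookie from get_download_url into the file download itself. The store id and category are illustrative, and the snippet assumes network access to publishedprices.co.il:

```python
# Usage sketch, assuming rami_levi.py is importable and the site is reachable.
import requests
from rami_levi import RamiLevi

session = requests.Session()  # keeps the login cookie across both requests
url = RamiLevi.get_download_url(store_id=1,
                                category=RamiLevi.XMLFilesCategory.PricesFull,
                                session=session)
content = session.get(url).content  # gzipped XML, decompressed in utils.py
```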

shufersal.py
@@ -14,8 +14,12 @@ class ShuferSal(SupermarketChain):
     date_hour_format = '%Y-%m-%d %H:%M'
     item_tag_name = 'Item'
 
+    @property
+    def update_date_format(self):
+        return ShuferSal.date_hour_format
+
     @staticmethod
-    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
         url = f"http://prices.shufersal.co.il/FileObject/UpdateCategory?catID={category.value}"
         if SupermarketChain.is_valid_store_id(int(store_id)):
             url += f"&storeId={store_id}"

supermarket_chain.py
@@ -1,7 +1,10 @@
+import re
 from abc import abstractmethod
 from enum import Enum
 from argparse import ArgumentTypeError
 from typing import Dict, List
 
+import requests
 from bs4.element import Tag
 
 from item import Item
@@ -35,6 +38,10 @@ class SupermarketChain:
     @abstractmethod
     def date_hour_format(self): pass
 
+    @property
+    @abstractmethod
+    def update_date_format(self): pass
+
     @property
     @abstractmethod
     def item_tag_name(self): pass
@@ -62,10 +69,11 @@ class SupermarketChain:
 
     @staticmethod
     @abstractmethod
-    def get_download_url(store_id: int, category: XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: XMLFilesCategory, session: requests.Session) -> str:
        """
        This method scrapes supermarket's website and returns a url containing the data for a given store and category.
 
+        :param session:
        :param store_id: A given id of a store
        :param category: A given category
        :return: A downloadable link of the data for a given store and category
@@ -91,6 +99,18 @@ class SupermarketChain:
         return [item.find('ItemCode').text for item in promo.find_all('Item')
                 if not items_dict.get(item.find('ItemCode').text)]
 
+    @staticmethod
+    def get_item_info(item: Tag) -> Item:
+        """
+        This function returns a string containing important information about a given supermarket's product.
+        """
+        return Item(
+            name=item.find(re.compile(r'ItemN[a]?m[e]?')).text,
+            price=item.find('ItemPrice').text,
+            manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text,
+            code=item.find('ItemCode').text
+        )
+
     @abstractmethod
     def __repr__(self):
         pass
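The tag-name regexes in the new get_item_info are deliberately lenient, presumably because the chains' XML files spell these tags differently. A quick check of what they accept:

```python
# The patterns from get_item_info accept both spellings of each tag name.
import re

name_re = re.compile(r'ItemN[a]?m[e]?')
manufacturer_re = re.compile(r'Manufacture[r]?Name')

for tag in ('ItemName', 'ItemNm'):
    print(tag, bool(name_re.match(tag)))          # True, True
for tag in ('ManufacturerName', 'ManufactureName'):
    print(tag, bool(manufacturer_re.match(tag)))  # True, True
```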

utils.py
@@ -4,12 +4,10 @@ import zipfile
 from typing import AnyStr, Dict
 import requests
 from bs4 import BeautifulSoup
-from bs4.element import Tag
 from os import path
 
 from item import Item
 from supermarket_chain import SupermarketChain
-import re
 
 RESULTS_DIRNAME = "results"
 RAW_FILES_DIRNAME = "raw_files"
@@ -60,8 +58,9 @@ def create_bs_object_from_link(xml_path: str, chain: SupermarketChain, category:
     :param category: A given category
     :return: A BeautifulSoup object with xml content.
     """
-    download_url: str = chain.get_download_url(store_id, category)
-    response_content = requests.get(download_url).content
+    session = requests.Session()
+    download_url: str = chain.get_download_url(store_id, category, session)
+    response_content = session.get(download_url).content
     try:
         xml_content: AnyStr = gzip.decompress(response_content)
     except gzip.BadGzipFile:
@@ -95,19 +94,7 @@ def create_items_dict(chain: SupermarketChain, load_xml, store_id: int) -> Dict[
     """
     xml_path: str = xml_file_gen(chain, store_id, chain.XMLFilesCategory.PricesFull.name)
     bs_prices: BeautifulSoup = create_bs_object(xml_path, chain, store_id, load_xml, chain.XMLFilesCategory.PricesFull)
-    return {item.find('ItemCode').text: get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
-
-
-def get_item_info(item: Tag) -> Item:
-    """
-    This function returns a string containing important information about a given supermarket's product.
-    """
-    return Item(
-        name=item.find(re.compile(r'ItemN[a]?m[e]?')).text,
-        price=item.find('ItemPrice').text,
-        manufacturer=item.find(re.compile(r'Manufacture[r]?Name')).text,
-        code=item.find('ItemCode').text
-    )
+    return {item.find('ItemCode').text: chain.get_item_info(item) for item in bs_prices.find_all(chain.item_tag_name)}
 
 
 def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool, product_name: str) -> None:
@@ -124,5 +111,10 @@ def get_products_prices(chain: SupermarketChain, store_id: int, load_xml: bool,
     prods = [item for item in bs_prices.find_all("Item") if product_name in item.find("ItemName").text]
     prods.sort(key=lambda x: float(x.find("UnitOfMeasurePrice").text))
     for prod in prods:
-        print((prod.find('ItemName').text[::-1], prod.find('ManufacturerName').text[::-1],
-               prod.find('ItemPrice').text))
+        print(
+            (
+                prod.find('ItemName').text[::-1],
+                prod.find('ManufacturerName').text[::-1],
+                prod.find('ItemPrice').text
+            )
+        )
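The [::-1] reversal in get_products_prices works around the left-to-right printing problem flagged by the TODO in main.py: Hebrew item names render backwards in an LTR terminal, so reversing the string makes them readable. A one-line illustration:

```python
# 'מגבת' ("towel") prints backwards in a left-to-right console; reversing
# the string is the stopgap used above until the TODO in main.py is fixed.
name = 'מגבת'
print(name[::-1])
```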

zol_vebegadol.py
@@ -20,8 +20,12 @@ class ZolVebegadol(SupermarketChain):
     date_hour_format = '%Y-%m-%d %H:%M:%S'
     item_tag_name = 'Item'
 
+    @property
+    def update_date_format(self):
+        return ZolVebegadol.date_hour_format
+
     @staticmethod
-    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory) -> str:
+    def get_download_url(store_id: int, category: SupermarketChain.XMLFilesCategory, session: requests.Session) -> str:
         prefix = "http://zolvebegadol.binaprojects.com"
         url = prefix + "/MainIO_Hok.aspx"
         req_res: requests.Response = requests.get(url)