From cffdd84086cd439f4923dd0fd29854c7e76c04c4 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Tue, 17 Aug 2021 13:06:42 +0300
Subject: [PATCH] Added specific searching for the download url of non-full
 promotions and prices files. Changed return value of get_download_url
 accordingly.

---
 chains/binaproject_web_client.py | 16 ++++++++++++++--
 chains/cerberus_web_client.py    | 10 +++++++++-
 chains/mahsaneiHashook.py        | 13 +++++++++++--
 utils.py                         |  2 ++
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/chains/binaproject_web_client.py b/chains/binaproject_web_client.py
index 8f5ecb7..ec23e4e 100644
--- a/chains/binaproject_web_client.py
+++ b/chains/binaproject_web_client.py
@@ -1,8 +1,12 @@
 import json
+import re
+
 import requests
 
 from supermarket_chain import SupermarketChain
 
+FNAME_KEY = "FileNm"
+
 
 class BinaProjectWebClient:
     _date_hour_format = '%Y-%m-%d %H:%M:%S'
@@ -16,8 +20,16 @@ class BinaProjectWebClient:
         url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"])
         req_res: requests.Response = session.get(url)
         jsons_files = json.loads(req_res.text)
-        suffix = next(cur_json["FileNm"] for cur_json in jsons_files if f'-{store_id:03d}-20' in cur_json["FileNm"]
-                      and category.name.replace('s', '') in cur_json["FileNm"])
+
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname \
+                                        and not re.search('full', fname, re.IGNORECASE)
+            if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files):
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname
+        suffix = next(
+            cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY]))
         down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix])
         return down_url
 
diff --git a/chains/cerberus_web_client.py b/chains/cerberus_web_client.py
index ae8ef98..f02f75e 100644
--- a/chains/cerberus_web_client.py
+++ b/chains/cerberus_web_client.py
@@ -1,4 +1,6 @@
 import json
+import re
+
 import requests
 
 from supermarket_chain import SupermarketChain
@@ -17,7 +19,13 @@ class CerberusWebClient:
         ajax_dir_payload: dict = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
         s: requests.Response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
         s_json: dict = json.loads(s.text)
-        suffix: str = next(d['name'] for d in s_json['aaData'] if f'-{store_id:03d}-20' in d['name'])
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name'] and not re.search('full', d['name'], re.IGNORECASE)
+            if not any(filter_func(d, store_id) for d in s_json['aaData']):
+                return ""  # Could not find non-full Prices/Promos file
+        else:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name']
+        suffix: str = next(d['name'] for d in s_json['aaData'] if filter_func(d, store_id))
         download_url: str = hostname + "/file/d/" + suffix
         return download_url
 
diff --git a/chains/mahsaneiHashook.py b/chains/mahsaneiHashook.py
index 9cb7c5b..b11e387 100644
--- a/chains/mahsaneiHashook.py
+++ b/chains/mahsaneiHashook.py
@@ -1,3 +1,4 @@
+import re
 from typing import Dict, List
 import requests
 from bs4 import BeautifulSoup
@@ -21,8 +22,16 @@
         url = prefix + "NBCompetitionRegulations.aspx"
         req_res: requests.Response = requests.get(url)
         soup = BeautifulSoup(req_res.text, features='lxml')
-        suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
-                                                        and f'-{store_id:03d}-20' in value).attrs['href']
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname \
+                                              and not re.search('full', fname, re.IGNORECASE)
+            if soup.find('a', href=fname_filter_func) is None:
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname
+        suffix: str = soup.find('a', href=fname_filter_func).attrs['href']
         down_url: str = prefix + suffix
         return down_url
 
diff --git a/utils.py b/utils.py
index 4fa2c75..4d3e3b2 100644
--- a/utils.py
+++ b/utils.py
@@ -66,6 +66,8 @@ def get_bs_object_from_link(chain: SupermarketChain, store_id: int, category: Su
     """
     session = requests.Session()
     download_url: str = chain.get_download_url(store_id, category, session)
+    if not download_url:
+        return BeautifulSoup()
     response_content = session.get(download_url).content
     try:
         xml_content: AnyStr = gzip.decompress(response_content)