From cffdd84086cd439f4923dd0fd29854c7e76c04c4 Mon Sep 17 00:00:00 2001
From: KorenLazar
Date: Tue, 17 Aug 2021 13:06:42 +0300
Subject: [PATCH] Added specific searching for the download url of non-full
 promotions and prices files. Changed return value of get_download_url
 accordingly.

---
 chains/binaproject_web_client.py | 16 ++++++++++++++--
 chains/cerberus_web_client.py    | 10 +++++++++-
 chains/mahsaneiHashook.py        | 13 +++++++++++--
 utils.py                         |  2 ++
 4 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/chains/binaproject_web_client.py b/chains/binaproject_web_client.py
index 8f5ecb7..ec23e4e 100644
--- a/chains/binaproject_web_client.py
+++ b/chains/binaproject_web_client.py
@@ -1,8 +1,12 @@
 import json
+import re
+
 import requests
 
 from supermarket_chain import SupermarketChain
 
+FNAME_KEY = "FileNm"
+
 
 class BinaProjectWebClient:
     _date_hour_format = '%Y-%m-%d %H:%M:%S'
@@ -16,8 +20,16 @@ class BinaProjectWebClient:
         url = '/'.join([hostname, self.path_prefix, "MainIO_Hok.aspx"])
         req_res: requests.Response = session.get(url)
         jsons_files = json.loads(req_res.text)
-        suffix = next(cur_json["FileNm"] for cur_json in jsons_files if f'-{store_id:03d}-20' in cur_json["FileNm"]
-                      and category.name.replace('s', '') in cur_json["FileNm"])
+
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname \
+                                        and not re.search('full', fname, re.IGNORECASE)
+            if not any(filter_func(cur_json[FNAME_KEY]) for cur_json in jsons_files):
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            filter_func = lambda fname: f'-{store_id:03d}-20' in fname and category.name.replace('s', '') in fname
+        suffix = next(
+            cur_json[FNAME_KEY] for cur_json in jsons_files if filter_func(cur_json[FNAME_KEY]))
         down_url: str = '/'.join([hostname, self.path_prefix, "Download", suffix])
         return down_url
 
diff --git a/chains/cerberus_web_client.py b/chains/cerberus_web_client.py
index ae8ef98..f02f75e 100644
--- a/chains/cerberus_web_client.py
+++ b/chains/cerberus_web_client.py
@@ -1,4 +1,6 @@
 import json
+import re
+
 import requests
 
 from supermarket_chain import SupermarketChain
@@ -17,7 +19,13 @@ class CerberusWebClient:
         ajax_dir_payload: dict = {'iDisplayLength': 100000, 'sSearch': category.name.replace('s', '')}
         s: requests.Response = session.post(hostname + "/file/ajax_dir", data=ajax_dir_payload)
         s_json: dict = json.loads(s.text)
-        suffix: str = next(d['name'] for d in s_json['aaData'] if f'-{store_id:03d}-20' in d['name'])
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name'] and not re.search('full', d['name'], re.IGNORECASE)
+            if not any(filter_func(d, store_id) for d in s_json['aaData']):
+                return ""  # Could not find non-full Prices/Promos file
+        else:
+            filter_func = lambda d, id: f'-{id:03d}-20' in d['name']
+        suffix: str = next(d['name'] for d in s_json['aaData'] if filter_func(d, store_id))
         download_url: str = hostname + "/file/d/" + suffix
         return download_url
 
diff --git a/chains/mahsaneiHashook.py b/chains/mahsaneiHashook.py
index 9cb7c5b..b11e387 100644
--- a/chains/mahsaneiHashook.py
+++ b/chains/mahsaneiHashook.py
@@ -1,3 +1,4 @@
+import re
 from typing import Dict, List
 import requests
 from bs4 import BeautifulSoup
@@ -21,8 +22,16 @@
         url = prefix + "NBCompetitionRegulations.aspx"
         req_res: requests.Response = requests.get(url)
         soup = BeautifulSoup(req_res.text, features='lxml')
-        suffix: str = soup.find('a', href=lambda value: value and category.name.replace('s', '') in value
-                                                        and f'-{store_id:03d}-20' in value).attrs['href']
+        if category in [SupermarketChain.XMLFilesCategory.Promos, SupermarketChain.XMLFilesCategory.Prices]:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname \
+                                              and not re.search('full', fname, re.IGNORECASE)
+            if soup.find('a', href=fname_filter_func) is None:
+                return ""  # Could not find non-full Promos/Prices file
+        else:
+            fname_filter_func = lambda fname: fname and category.name.replace('s', '') in fname \
+                                              and f'-{store_id:03d}-20' in fname
+        suffix: str = soup.find('a', href=fname_filter_func).attrs['href']
         down_url: str = prefix + suffix
         return down_url
 
diff --git a/utils.py b/utils.py
index 4fa2c75..4d3e3b2 100644
--- a/utils.py
+++ b/utils.py
@@ -66,6 +66,8 @@ def get_bs_object_from_link(chain: SupermarketChain, store_id: int, category: Su
     """
     session = requests.Session()
     download_url: str = chain.get_download_url(store_id, category, session)
+    if not download_url:
+        return BeautifulSoup()
     response_content = session.get(download_url).content
     try:
         xml_content: AnyStr = gzip.decompress(response_content)