From 92bb5b2e85ae07a8da1ea5066e681ac80660f862 Mon Sep 17 00:00:00 2001
From: Elmar Kresse
Date: Mon, 10 Feb 2025 10:02:53 +0100
Subject: [PATCH] feat: update Docker setup, add environment variables, and
 enhance property scraping logic

---
 .gitignore              |  3 ++-
 Dockerfile              |  4 ++-
 docker-compose.yml      |  6 ++++-
 known_properties.json   |  2 +-
 main.py                 | 25 ++++++++++++------
 requirements.txt        | 56 +++--------------------------------------
 sample.env              |  3 +++
 src/bgl/bgl.py          | 17 ++++++++++---
 src/discord/webhook.py  |  7 +++---
 src/lwb/scrape_image.py |  9 ++++++-
 src/lwb/scraper.py      |  8 ++++--
 11 files changed, 66 insertions(+), 74 deletions(-)
 create mode 100644 sample.env

diff --git a/.gitignore b/.gitignore
index 37d5c94..07765b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .venv
 *.pyc
-__pycache__
\ No newline at end of file
+__pycache__
+.env
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index c38f1f9..b5f5768 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,6 +7,8 @@ WORKDIR /app
 
 RUN pip install -r requirements.txt
 
-COPY . /app
+COPY ./main.py /app
+COPY ./known_properties.json /app
+COPY ./src/. /app/src
 
 CMD ["python", "-u", "/app/main.py"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 7c78c19..cbc0d9d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,9 @@
 services:
   python:
-    image: gitlab.dit.htwk-leipzig.de:5050/ekresse/flatscraper:main
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: flatscraper
+    env_file:
+      - .env
\ No newline at end of file
diff --git a/known_properties.json b/known_properties.json
index c1abe17..35e856b 100644
--- a/known_properties.json
+++ b/known_properties.json
@@ -1 +1 @@
-[51968, "172DFAD2-7CDB-51B4-212E-E6F9C7F0601A", "FA45C0B6-813C-DE65-496A-EDD8DA3F2526", "9A0B42A2-6D9B-331A-DAA7-624A5FA49606", "F7992488-7C24-DFA9-F8B2-94DDC18E66A3", "C02892BE-F34F-5A8A-E174-4A79549DC9A9", "id-32-1-41", "21C20126-380D-9B0F-73F8-C4279897F189", "BAAEC20A-F667-FE22-6693-E4B4CA366889", "665243A8-FD34-86F8-322F-FE9B0B392083", -4432880139075606878, 6598001214168193439, 3637269161070974884, 51624, 3048296312984895144, "E7B71D28-C557-CFE4-805D-42C2793E9248", 51628, 3286076625991780524, 52087, "id-193-1-13", 51632, "id-105-12-78", "id-88-5-45", "57446DF5-CB9F-951C-A40A-4BA775DA7426", "id-19-3-19", "B4095706-A65C-F421-B02E-1D227B684B62", 9131742071665368621, "id-58-4-30", "892BD779-F186-9BD1-A97A-5783EFB6F56D", "id-154-2-71", -6815659680085605823, 51656, "id-158-2-18", 51660, 51665, 213337995196486226, "id-145-1-9", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "id-148-1-30", "B57516F9-E364-7E54-A211-527ED54388E6", "3B73B720-13F2-62A4-8829-557676725A95", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "281A7F0B-FA5A-75AE-68DE-89AFD90EEF48", "id-202-1-11", "id-147-1-1", 6615894728235425776, "id-30-5-36", "A979EBAE-EF87-FB51-152A-5453CD7DC794", -9017525000097635974, 51963]
\ No newline at end of file
+["b36ab45e1c5899b4c9add5b1134e731e6c97858fb7de80373df925c2246c824d", "449c368b2b7bca515840659acd6702071fa62e59917ba7a29a180d49ca19ad82", "id-58-4-30", "8901.039", "2702.004", "id-148-1-30", "7e59929357819219851c5dd0f4addb2c6d472fe5e1001b1c6a0021597d6ec4cc", "fe1245cb4d60d639f47edbf5b4ccf44c2639595723ef412d059760b3c3b075e2", "3207.006", "8ee618d09d89849e48da7a66efafe046f52c833baa08babbd98d8574578a259f", "e804d28f1dea007b5a3ff762dfdd7cc29fb1f42104ac0f15bfac74b9254174bd", "10104.013", 52087, "dc552b2bfb958b166d98425b8ee43420130a8eaea7e494da003fb1f7734cb1fb", "1809676f5b586fb5ad89ae3ec6082c45dad170be9b67c18fa887cd97df0b375c", "id-158-2-18", "9101.046"]
\ No newline at end of file
diff --git a/main.py b/main.py
index 7263b23..a0ad3e6 100644
--- a/main.py
+++ b/main.py
@@ -27,14 +27,23 @@ def main():
     while True:
         current_time = time.strftime("%H:%M:%S", time.localtime())
         print("Scraping properties at " + current_time)
-        print("Scraping properties from Wogetra...")
-        properties = wogetra_scraper.scrape_wogetra()
-        print("Scraping properties from LWB...")
-        properties += lwb_scraper.scrape_easysquare()
-        print("Scraping properties from Lipsia...")
-        properties += lipsia_scraper.scrape_lipsia()
-        print("Scraping properties from BGL...")
-        properties = bgl_scraper.fetch_all_properties()
+
+        properties_wogetra = wogetra_scraper.scrape_wogetra()
+        print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
+        properties = properties_wogetra
+
+        properties_lwb = lwb_scraper.scrape_easysquare()
+        print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
+        properties += properties_lwb
+
+        properties_lipsia = lipsia_scraper.scrape_lipsia()
+        print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
+        properties += properties_lipsia
+
+        properties_bgl = bgl_scraper.fetch_all_properties()
+        print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
+        properties += properties_bgl
+
 
         for prop in properties:
             if prop["id"] not in known_properties:
diff --git a/requirements.txt b/requirements.txt
index 6563100..7593449 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,59 +1,9 @@
-aiohappyeyeballs==2.4.4
-aiohttp==3.11.11
-aiosignal==1.3.2
-asttokens==3.0.0
-async-timeout==5.0.1
-attrs==24.3.0
-backcall==0.2.0
-beautifulsoup4==4.12.3
-bleach==6.2.0
-certifi==2024.12.14
+beautifulsoup4==4.13.3
+certifi==2025.1.31
 charset-normalizer==3.4.1
-colorama==0.4.6
-decorator==5.1.1
-defusedxml==0.7.1
-discord.py==2.4.0
-docopt==0.6.2
-executing==2.1.0
-fastjsonschema==2.21.1
-frozenlist==1.5.0
 idna==3.10
-ipython==8.12.3
-jedi==0.19.2
-Jinja2==3.1.5
-jsonschema==4.23.0
-jsonschema-specifications==2024.10.1
-MarkupSafe==3.0.2
-matplotlib-inline==0.1.7
-mistune==3.1.0
-multidict==6.1.0
-nbclient==0.10.2
-nbconvert==7.16.5
-nbformat==5.10.4
-packaging==24.2
-pandocfilters==1.5.1
-parso==0.8.4
-pickleshare==0.7.5
-platformdirs==4.3.6
-prompt_toolkit==3.0.48
-propcache==0.2.1
-pure_eval==0.2.3
-Pygments==2.19.1
-python-dateutil==2.9.0.post0
-pyzmq==26.2.0
-referencing==0.35.1
+python-dotenv==1.0.1
 requests==2.32.3
-rpds-py==0.22.3
-schedule==1.2.2
-six==1.17.0
 soupsieve==2.6
-stack-data==0.6.3
-tinycss2==1.4.0
-tornado==6.4.2
-traitlets==5.14.3
 typing_extensions==4.12.2
 urllib3==2.3.0
-wcwidth==0.2.13
-webencodings==0.5.1
-yarg==0.1.9
-yarl==1.18.3
diff --git a/sample.env b/sample.env
new file mode 100644
index 0000000..6d2ba58
--- /dev/null
+++ b/sample.env
@@ -0,0 +1,3 @@
+SAP_SESSIONID=<your-sap-sessionid>
+COOKIE_SESSION=<your-cookie-session-id>
+WEBHOOK_URL=https://discord.com/api/webhooks/<webhook-id>/<webhook-token>
\ No newline at end of file
diff --git a/src/bgl/bgl.py b/src/bgl/bgl.py
index d89a6ef..f361e49 100644
--- a/src/bgl/bgl.py
+++ b/src/bgl/bgl.py
@@ -52,7 +52,10 @@ def fetch_all_properties():
         prop_soup = BeautifulSoup(response.text, "html.parser")
 
         # get h3 with class adresse and extract the text
-        prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
+        try :
+            prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
+        except:
+            prop_title = "N/A"
 
         # create a value entrie tuple list
         facts = []
@@ -91,15 +94,23 @@ def fetch_all_properties():
         # image is in img tag with class "img-responsive"
         image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]
 
+        # from prop_soup get the ifram with id "gmap_canvas" and extract the src
+        google_maps_link = prop_soup.find("iframe", {"id": "gmap_canvas"})
+        google_maps_link = google_maps_link["data-original-src"]
+        # remove the query parameter output=embed
+        google_maps_link = google_maps_link.replace("&output=embed", "")
+        # remove width and height
+        google_maps_link = google_maps_link.replace("width=300&height=220&", "")
+
         properties.append({
             "id": obj_id,
             "title": "BGL - " + prop_title,
-            "subtitle": "",
+            "subtitle": google_maps_link,
             "rooms": room_count,
             "size": size,
             "rent": cold_rent,
             "link": bgl_url + prop_url,
-            "abstract": "",
+            "abstract": "Andere Kosten: " + other_costs + " Heizkosten: " + heating_costs + " Etage:" + level,
             "warm_rent": rent,
             "availability": availability,
             "image_url": image_url,
diff --git a/src/discord/webhook.py b/src/discord/webhook.py
index 642b6de..b8412c9 100644
--- a/src/discord/webhook.py
+++ b/src/discord/webhook.py
@@ -1,11 +1,12 @@
 import json
 import requests
-
+from dotenv import load_dotenv
+import os
 import src.lwb.scrape_image as scrape_image
 
+load_dotenv()
 # Webhook URL from Discord
-WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
-
+WEBHOOK_URL = os.getenv("WEBHOOK_URL")
 
 # Funktion: Nachricht an Discord senden
 def send_to_discord(property_data):
diff --git a/src/lwb/scrape_image.py b/src/lwb/scrape_image.py
index 0ed516c..6a0147d 100644
--- a/src/lwb/scrape_image.py
+++ b/src/lwb/scrape_image.py
@@ -3,6 +3,7 @@ from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
 
 def scrape_image(url, owner):
     session = requests.Session()
+    response = None
 
     if owner == "BGL":
         response = session.get(url)
@@ -18,6 +19,12 @@ def scrape_image(url, owner):
             # return empty image
             return b''
 
-    # get image from response
+    # Handle other owners or fallback
+    if response is None:
+        response = session.get(url)
+        if response.status_code != 200:
+            print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
+            return b''
+
     return response.content
 
diff --git a/src/lwb/scraper.py b/src/lwb/scraper.py
index 11b4e75..7f41ffe 100644
--- a/src/lwb/scraper.py
+++ b/src/lwb/scraper.py
@@ -2,11 +2,15 @@ import requests
 import xml.etree.ElementTree as ET
 import src.lwb.format as format
 import hashlib
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
 
 SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
 
-SAP_SESSIONID = "iZ52JjFdvDRY0528vXt4y4tdOvzk1xHvhW4KELG5Agg%3d"
-COOKIE_SESSION = "678ADA670E24565B64423D923CC07C0B"
+SAP_SESSIONID = os.getenv("SAP_SESSIONID")
+COOKIE_SESSION = os.getenv("COOKIE_SESSION")
 
 EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
 EASYSQUARE_HEADERS = {