From 92bb5b2e85ae07a8da1ea5066e681ac80660f862 Mon Sep 17 00:00:00 2001
From: Elmar Kresse
Date: Mon, 10 Feb 2025 10:02:53 +0100
Subject: [PATCH] feat: update Docker setup, add environment variables, and
 enhance property scraping logic

---
 .gitignore              |  3 ++-
 Dockerfile              |  4 ++-
 docker-compose.yml      |  6 ++++-
 known_properties.json   |  2 +-
 main.py                 | 25 ++++++++++++------
 requirements.txt        | 56 +++--------------------------------------
 sample.env              |  3 +++
 src/bgl/bgl.py          | 17 ++++++++++---
 src/discord/webhook.py  |  7 +++---
 src/lwb/scrape_image.py |  9 ++++++-
 src/lwb/scraper.py      |  8 ++++--
 11 files changed, 66 insertions(+), 74 deletions(-)
 create mode 100644 sample.env

diff --git a/.gitignore b/.gitignore
index 37d5c94..07765b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 .venv
 *.pyc
-__pycache__
\ No newline at end of file
+__pycache__
+.env
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index c38f1f9..b5f5768 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -7,6 +7,8 @@ WORKDIR /app
 
 RUN pip install -r requirements.txt
 
-COPY . /app
+COPY ./main.py /app
+COPY ./known_properties.json /app
+COPY ./src/. /app/src
 
 CMD ["python", "-u", "/app/main.py"]
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 7c78c19..cbc0d9d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,5 +1,9 @@
 services:
   python:
-    image: gitlab.dit.htwk-leipzig.de:5050/ekresse/flatscraper:main
+    build:
+      context: .
+      dockerfile: Dockerfile
     container_name: flatscraper
+    env_file:
+      - .env
\ No newline at end of file
diff --git a/known_properties.json b/known_properties.json
index c1abe17..35e856b 100644
--- a/known_properties.json
+++ b/known_properties.json
@@ -1 +1 @@
-[51968, "172DFAD2-7CDB-51B4-212E-E6F9C7F0601A", "FA45C0B6-813C-DE65-496A-EDD8DA3F2526", "9A0B42A2-6D9B-331A-DAA7-624A5FA49606", "F7992488-7C24-DFA9-F8B2-94DDC18E66A3", "C02892BE-F34F-5A8A-E174-4A79549DC9A9", "id-32-1-41", "21C20126-380D-9B0F-73F8-C4279897F189", "BAAEC20A-F667-FE22-6693-E4B4CA366889", "665243A8-FD34-86F8-322F-FE9B0B392083", -4432880139075606878, 6598001214168193439, 3637269161070974884, 51624, 3048296312984895144, "E7B71D28-C557-CFE4-805D-42C2793E9248", 51628, 3286076625991780524, 52087, "id-193-1-13", 51632, "id-105-12-78", "id-88-5-45", "57446DF5-CB9F-951C-A40A-4BA775DA7426", "id-19-3-19", "B4095706-A65C-F421-B02E-1D227B684B62", 9131742071665368621, "id-58-4-30", "892BD779-F186-9BD1-A97A-5783EFB6F56D", "id-154-2-71", -6815659680085605823, 51656, "id-158-2-18", 51660, 51665, 213337995196486226, "id-145-1-9", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "id-148-1-30", "B57516F9-E364-7E54-A211-527ED54388E6", "3B73B720-13F2-62A4-8829-557676725A95", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "281A7F0B-FA5A-75AE-68DE-89AFD90EEF48", "id-202-1-11", "id-147-1-1", 6615894728235425776, "id-30-5-36", "A979EBAE-EF87-FB51-152A-5453CD7DC794", -9017525000097635974, 51963]
\ No newline at end of file
+["b36ab45e1c5899b4c9add5b1134e731e6c97858fb7de80373df925c2246c824d", "449c368b2b7bca515840659acd6702071fa62e59917ba7a29a180d49ca19ad82", "id-58-4-30", "8901.039", "2702.004", "id-148-1-30", "7e59929357819219851c5dd0f4addb2c6d472fe5e1001b1c6a0021597d6ec4cc", "fe1245cb4d60d639f47edbf5b4ccf44c2639595723ef412d059760b3c3b075e2", "3207.006", "8ee618d09d89849e48da7a66efafe046f52c833baa08babbd98d8574578a259f", "e804d28f1dea007b5a3ff762dfdd7cc29fb1f42104ac0f15bfac74b9254174bd", "10104.013", 52087, "dc552b2bfb958b166d98425b8ee43420130a8eaea7e494da003fb1f7734cb1fb", "1809676f5b586fb5ad89ae3ec6082c45dad170be9b67c18fa887cd97df0b375c", "id-158-2-18", "9101.046"]
\ No newline at end of file
diff --git a/main.py b/main.py
index 7263b23..a0ad3e6 100644
--- a/main.py
+++ b/main.py
@@ -27,14 +27,23 @@ def main():
     while True:
         current_time = time.strftime("%H:%M:%S", time.localtime())
         print("Scraping properties at " + current_time)
-        print("Scraping properties from Wogetra...")
-        properties = wogetra_scraper.scrape_wogetra()
-        print("Scraping properties from LWB...")
-        properties += lwb_scraper.scrape_easysquare()
-        print("Scraping properties from Lipsia...")
-        properties += lipsia_scraper.scrape_lipsia()
-        print("Scraping properties from BGL...")
-        properties = bgl_scraper.fetch_all_properties()
+
+        properties_wogetra = wogetra_scraper.scrape_wogetra()
+        print("Scraped " + str(len(properties_wogetra)) + " properties from Wogetra")
+        properties = properties_wogetra
+
+        properties_lwb = lwb_scraper.scrape_easysquare()
+        print("Scraped " + str(len(properties_lwb)) + " properties from LWB")
+        properties += properties_lwb
+
+        properties_lipsia = lipsia_scraper.scrape_lipsia()
+        print("Scraped " + str(len(properties_lipsia)) + " properties from Lipsia")
+        properties += properties_lipsia
+
+        properties_bgl = bgl_scraper.fetch_all_properties()
+        print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
+        properties += properties_bgl
+
 
         for prop in properties:
             if prop["id"] not in known_properties:
diff --git a/requirements.txt b/requirements.txt
index 6563100..7593449 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,59 +1,9 @@
-aiohappyeyeballs==2.4.4
-aiohttp==3.11.11
-aiosignal==1.3.2
-asttokens==3.0.0
-async-timeout==5.0.1
-attrs==24.3.0
-backcall==0.2.0
-beautifulsoup4==4.12.3
-bleach==6.2.0
-certifi==2024.12.14
+beautifulsoup4==4.13.3
+certifi==2025.1.31
 charset-normalizer==3.4.1
-colorama==0.4.6
-decorator==5.1.1
-defusedxml==0.7.1
-discord.py==2.4.0
-docopt==0.6.2
-executing==2.1.0
-fastjsonschema==2.21.1
-frozenlist==1.5.0
 idna==3.10
-ipython==8.12.3
-jedi==0.19.2
-Jinja2==3.1.5
-jsonschema==4.23.0
-jsonschema-specifications==2024.10.1
-MarkupSafe==3.0.2
-matplotlib-inline==0.1.7
-mistune==3.1.0
-multidict==6.1.0
-nbclient==0.10.2
-nbconvert==7.16.5
-nbformat==5.10.4
-packaging==24.2
-pandocfilters==1.5.1
-parso==0.8.4
-pickleshare==0.7.5
-platformdirs==4.3.6
-prompt_toolkit==3.0.48
-propcache==0.2.1
-pure_eval==0.2.3
-Pygments==2.19.1
-python-dateutil==2.9.0.post0
-pyzmq==26.2.0
-referencing==0.35.1
+python-dotenv==1.0.1
 requests==2.32.3
-rpds-py==0.22.3
-schedule==1.2.2
-six==1.17.0
 soupsieve==2.6
-stack-data==0.6.3
-tinycss2==1.4.0
-tornado==6.4.2
-traitlets==5.14.3
 typing_extensions==4.12.2
 urllib3==2.3.0
-wcwidth==0.2.13
-webencodings==0.5.1
-yarg==0.1.9
-yarl==1.18.3
diff --git a/sample.env b/sample.env
new file mode 100644
index 0000000..6d2ba58
--- /dev/null
+++ b/sample.env
@@ -0,0 +1,3 @@
+SAP_SESSIONID=<your-sap-sessionid>
+COOKIE_SESSION=<your-cookie-session-id>
+WEBHOOK_URL=https://discord.com/api/webhooks/<webhook-id>/<webhook-token>
\ No newline at end of file
diff --git a/src/bgl/bgl.py b/src/bgl/bgl.py
index d89a6ef..f361e49 100644
--- a/src/bgl/bgl.py
+++ b/src/bgl/bgl.py
@@ -52,7 +52,10 @@ def fetch_all_properties():
         prop_soup = BeautifulSoup(response.text, "html.parser")
 
         # get h3 with class adresse and extract the text
-        prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
+        try :
+            prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
+        except:
+            prop_title = "N/A"
 
         # create a value entrie tuple list
         facts = []
@@ -91,15 +94,23 @@ def fetch_all_properties():
         # image is in img tag with class "img-responsive"
         image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]
 
+        # from prop_soup get the ifram with id "gmap_canvas" and extract the src
+        google_maps_link = prop_soup.find("iframe", {"id": "gmap_canvas"})
+        google_maps_link = google_maps_link["data-original-src"]
+        # remove the query parameter output=embed
+        google_maps_link = google_maps_link.replace("&output=embed", "")
+        # remove width and height
+        google_maps_link = google_maps_link.replace("width=300&height=220&", "")
+
         properties.append({
             "id": obj_id,
             "title": "BGL - " + prop_title,
-            "subtitle": "",
+            "subtitle": google_maps_link,
             "rooms": room_count,
             "size": size,
             "rent": cold_rent,
             "link": bgl_url + prop_url,
-            "abstract": "",
+            "abstract": "Andere Kosten: " + other_costs + " Heizkosten: " + heating_costs + " Etage:" + level,
             "warm_rent": rent,
             "availability": availability,
             "image_url": image_url,
diff --git a/src/discord/webhook.py b/src/discord/webhook.py
index 642b6de..b8412c9 100644
--- a/src/discord/webhook.py
+++ b/src/discord/webhook.py
@@ -1,11 +1,12 @@
 import json
 import requests
-
+from dotenv import load_dotenv
+import os
 import src.lwb.scrape_image as scrape_image
 
+load_dotenv()
 # Webhook URL from Discord
-WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
-
+WEBHOOK_URL = os.getenv("WEBHOOK_URL")
 
 # Funktion: Nachricht an Discord senden
 def send_to_discord(property_data):
diff --git a/src/lwb/scrape_image.py b/src/lwb/scrape_image.py
index 0ed516c..6a0147d 100644
--- a/src/lwb/scrape_image.py
+++ b/src/lwb/scrape_image.py
@@ -3,6 +3,7 @@ from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
 
 def scrape_image(url, owner):
     session = requests.Session()
+    response = None
 
     if owner == "BGL":
         response = session.get(url)
@@ -18,6 +19,12 @@ def scrape_image(url, owner):
             # return empty image
             return b''
 
-    # get image from response
+    # Handle other owners or fallback
+    if response is None:
+        response = session.get(url)
+        if response.status_code != 200:
+            print(f"Fehler beim Abrufen der Standardquelle: {response.status_code}")
+            return b''
+
     return response.content
 
diff --git a/src/lwb/scraper.py b/src/lwb/scraper.py
index 11b4e75..7f41ffe 100644
--- a/src/lwb/scraper.py
+++ b/src/lwb/scraper.py
@@ -2,11 +2,15 @@ import requests
 import xml.etree.ElementTree as ET
 import src.lwb.format as format
 import hashlib
+import os
+from dotenv import load_dotenv
+
+load_dotenv()
 
 SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
 
-SAP_SESSIONID = "iZ52JjFdvDRY0528vXt4y4tdOvzk1xHvhW4KELG5Agg%3d"
-COOKIE_SESSION = "678ADA670E24565B64423D923CC07C0B"
+SAP_SESSIONID = os.getenv("SAP_SESSIONID")
+COOKIE_SESSION = os.getenv("COOKIE_SESSION")
 
 EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
 EASYSQUARE_HEADERS = {