add initial scraping functionality and related utilities

This commit is contained in:
Elmar Kresse
2025-01-13 10:44:01 +01:00
parent 9a0a72f640
commit b337b7c2f8
15 changed files with 376 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.venv

1
known_properties.json Normal file
View File

@ -0,0 +1 @@
["803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "id-193-1-13", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "B4095706-A65C-F421-B02E-1D227B684B62", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "id-154-2-71", "id-105-12-78", "id-88-5-45"]

49
main.py Normal file
View File

@ -0,0 +1,49 @@
from bs4 import BeautifulSoup
import json
import time
import src.wogetra.scraper as wogetra_scraper
import src.lwb.scraper as lwb_scraper
import src.discord.webhook as localwebhook
# URL of the website to scrape
# NOTE(review): unused in this module — src/wogetra/scraper.py defines its own
# WOGETRA_URL; confirm and remove one of the two.
TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
# Store known property IDs to avoid duplicate notifications
# (persisted to known_properties.json between runs by main())
known_properties = set()
# Main loop to periodically check for new listings
def main():
    """Poll all property sources forever and notify Discord of new listings.

    Loads the persisted set of already-seen property IDs, then every five
    minutes scrapes Wogetra and LWB/Easysquare, sends a Discord notification
    for each unseen property, and persists the updated ID set.
    """
    global known_properties

    # Load known properties from file so restarts do not re-notify old listings.
    try:
        with open("known_properties.json", "r") as file:
            known_properties = set(json.load(file))
    except FileNotFoundError:
        print("No known properties file found. Starting fresh.")
    except json.JSONDecodeError:
        # A corrupt or half-written file must not kill the bot on startup.
        print("Known properties file is corrupt. Starting fresh.")

    while True:
        print("Scraping properties...")
        properties = []

        # One failing source must not prevent the other from being checked;
        # this is a long-running top-level loop, so log and keep going.
        try:
            print("Scraping properties from Wogetra...")
            properties += wogetra_scraper.scrape_wogetra()
        except Exception as err:
            print(f"Wogetra scrape failed: {err}")
        try:
            print("Scraping properties from LWB...")
            properties += lwb_scraper.scrape_easysquare()
        except Exception as err:
            print(f"LWB scrape failed: {err}")

        found_new = False
        for prop in properties:
            if prop["id"] not in known_properties:
                # Notify Discord and mark as known
                localwebhook.send_to_discord(prop)
                known_properties.add(prop["id"])
                found_new = True

        # Save known properties to file, but only when something changed,
        # to avoid a needless disk write every cycle.
        if found_new:
            with open("known_properties.json", "w") as file:
                json.dump(list(known_properties), file)

        # Wait before checking again
        print("Waiting for the next check...")
        time.sleep(300)  # Check every 5 minutes


if __name__ == "__main__":
    main()

0
requirements.txt Normal file
View File

Binary file not shown.

55
src/discord/webhook.py Normal file
View File

@ -0,0 +1,55 @@
import json
import requests
import src.lwb.scrape_image as scrape_image
# Webhook URL from Discord
# NOTE(review): this embeds a live webhook token in source control — it should
# be rotated and loaded from an environment variable or config file instead.
WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
# Send a notification for one property to the Discord webhook.
def send_to_discord(property_data):
    """Post a formatted property listing to the Discord webhook.

    property_data: dict with the keys produced by the scrapers (title,
    subtitle, rooms, size, rent, warm_rent, availability, link, abstract,
    and optionally image_url). When image_url is set, the image is downloaded
    and attached; on download failure the message is sent without it.
    """
    message = (
        f"**{property_data['title']}**\n"
        f"{property_data['subtitle']}\n"
        f"**Zimmer:** {property_data['rooms']}\n"
        f"**Wohnfläche:** {property_data['size']}\n"
        f"**Gesamtmiete:** {property_data['rent']}\n"
        f"**Warmmiete:** {property_data['warm_rent']}\n"
        f"**Verfügbar ab:** {property_data['availability']}\n"
        f"**Link:** {property_data['link']}\n"
        f"**Beschreibung:** {property_data['abstract']}"
    )

    # Shared request pieces — previously duplicated across three branches.
    headers = {"Content-Type": "application/json"}
    payload = {"content": message}

    if property_data.get("image_url"):
        try:
            # Download the image and send the message with it attached.
            image_response = scrape_image.scrape_image(property_data["image_url"])
            files = {"file": ("image.jpg", image_response)}
            # With an attachment the payload goes as multipart form data.
            response = requests.post(WEBHOOK_URL, data=payload, files=files)
        except requests.exceptions.RequestException as e:
            # Fall back to a text-only message if the download fails.
            print(f"Fehler beim Herunterladen des Bildes: {e}")
            response = requests.post(WEBHOOK_URL, data=json.dumps(payload), headers=headers)
    else:
        # Send the message without an image.
        response = requests.post(WEBHOOK_URL, data=json.dumps(payload), headers=headers)

    # Any 2xx status counts as success.
    if 200 <= response.status_code < 300:
        print(f"Benachrichtigung gesendet: {property_data['title']} - response: {response.status_code}")
    else:
        print(f"Fehler beim Senden der Benachrichtigung: {response.status_code}")

Binary file not shown.

Binary file not shown.

Binary file not shown.

57
src/lwb/format.py Normal file
View File

@ -0,0 +1,57 @@
from datetime import datetime
def format_date(date):
    """Convert a raw value like "B20250113" into "13 January 2025"."""
    # Drop the leading "B" marker, then parse the remaining YYYYMMDD digits.
    parsed = datetime.strptime(date[1:], "%Y%m%d")
    # Render as "day month year"; the month name follows the active locale.
    return parsed.strftime("%d %B %Y")
def format_room(room):
    """Format a raw room value like "B000000030000" as "3 Zimmer".

    Keeps the last five digits, strips leading zeros, and reads the first
    remaining digit as the room count.
    """
    digits = room[-5:].lstrip("0")
    room_count = int(digits[:1])
    # The original if/else returned the identical string in both branches
    # (German "Zimmer" has no distinct plural), so the branch was dead code.
    return f"{room_count} Zimmer"
def format_money(money):
    """Format a raw amount like "B000000079900" as "799,00".

    The raw value is a "B"-prefixed integer in cents with leading zeros.
    """
    # Skip the "B" prefix and drop leading zeros.
    amount_part = money[1:].lstrip("0")
    # Pad to at least three digits so amounts below one euro still yield a
    # euros digit ("50" -> "050" -> "0,50" instead of ",50").
    amount_part = amount_part.zfill(3)
    euros = amount_part[:-2]
    cents = amount_part[-2:]
    # Combine with a decimal comma (German convention).
    return f"{euros},{cents}"
def format_roomSize(room):
    """Format a raw area value like "B000000502800" as "50,28".

    The raw value carries four decimal places; only the first two are
    meaningful for display (the sibling test script documents the intent:
    B000000502800 -> 50,28 m²). The previous version emitted all four
    decimals ("50,2800").
    """
    # Skip the "B" prefix and drop leading zeros.
    amount_part = room[1:].lstrip("0")
    # Whole square metres; fall back to "0" for areas below 1 m².
    meters = amount_part[:-4] or "0"
    # Keep only the first two of the four decimal digits.
    decimals = amount_part[-4:-2]
    return f"{meters},{decimals}"

35
src/lwb/scrape_image.py Normal file
View File

@ -0,0 +1,35 @@
import requests
# HTTP headers replayed from a captured browser session against
# portal1s.easysquare.com.
# NOTE(review): the Cookie pins a fixed SAP session ID — it will expire;
# confirm whether the image endpoint works without it or fetch a fresh one.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=zqFIhvNbEsOs_n3cgRTIO1V7ZaLQCxHvhYgKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1"
}
# Query parameters for the Easysquare API.
# NOTE(review): "_" looks like a cache-busting timestamp captured from the
# browser request — presumably any value is accepted; verify.
EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    "_": "1736595414769"
}
def scrape_image(url):
    """Download an image from the Easysquare portal and return its raw bytes.

    Returns empty bytes on a non-200 response. The previous version returned
    a list ([]) on failure, which crashed callers that write the result to a
    file or attach it to an upload; an empty bytes object keeps the return
    type consistent.
    """
    session = requests.Session()
    response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
        return b""
    # Return the raw image payload from the response.
    return response.content

116
src/lwb/scraper.py Normal file
View File

@ -0,0 +1,116 @@
import requests
import xml.etree.ElementTree as ET
import src.lwb.format as format
# Base endpoint of the Easysquare XML-forms API.
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
# HTTP headers replayed from a captured browser session.
# NOTE(review): the Cookie pins a fixed SAP session ID — it will expire and
# should be obtained dynamically; verify against the portal.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1"
}
# Query parameters for the "boxlist" action (the listing feed).
# NOTE(review): "_" appears to be a cache-busting timestamp from the captured
# request — presumably any value works; verify.
EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    "_": "1736761256321"
}
# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \
# --header 'DNT: 1' \
# --header 'UTC: 1736761256321' \
# --header 'Host: portal1s.easysquare.com' \
# --header 'host: portal1s.easysquare.com' \
# --header 'Accept: text/plain, */*; q=0.01' \
# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \
# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \
# --header 'Sec-GPC: 1' \
# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \
# --header 'Connection: keep-alive' \
# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \
# --header 'Content-Type: text/plain;charset=UTF-8' \
# --header 'Sec-Fetch-Dest: empty' \
# --header 'Sec-Fetch-Mode: cors' \
# --header 'Sec-Fetch-Site: same-origin' \
# --header 'Accept-Encoding: gzip, deflate, br, zstd' \
# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \
# --header 'X-Requested-With: XMLHttpRequest'
# Scrape rental listings from the Easysquare portal (LWB).
def scrape_easysquare():
    """Fetch the Easysquare XML listing feed and return property dicts.

    Each dict matches the schema the Discord webhook expects (id, title,
    subtitle, rooms, size, rent, link, abstract, warm_rent, availability,
    image_url). Returns an empty list on HTTP errors. Missing XML elements
    no longer abort the whole scrape (the previous version dereferenced
    .text / .get on possibly-absent nodes).
    """
    session = requests.Session()
    response = session.get(EASYSQUARE_URL, headers=EASYSQUARE_HEADERS,
                           params=EASYSQUARE_PARAMS)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
        return []

    # Parse the XML payload.
    root = ET.fromstring(response.content)
    namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}

    properties = []
    for head in root.findall(".//ns:head", namespace):

        def child_text(tag):
            # A missing or empty element must not crash the whole scrape.
            node = head.find(f"ns:{tag}", namespace)
            return node.text if node is not None and node.text else ""

        prop_title = child_text("title")
        subtitle = child_text("subtitle")
        abstract = child_text("abstract").strip()
        prop_id = child_text("id")

        # Coordinates, e.g.
        # <address city="" lat="51.346061" lon="12.3774656" postcode="" street=""/>
        address = head.find("ns:address", namespace)
        lat = address.get("lat") if address is not None else ""
        lon = address.get("lon") if address is not None else ""

        image = head.find("ns:image", namespace)
        image_resource_id = image.get("resourceId") if image is not None else ""

        # Extract the listing details; values default to "N/A" when absent.
        rooms = "N/A"
        size = "N/A"
        rent = "N/A"
        availability = "N/A"
        for criterion in head.findall(".//ns:criterion", namespace):
            criterion_title = criterion.get("title")
            value = criterion.text.strip() if criterion.text else "N/A"
            if criterion_title == "Zimmer":
                rooms = value
            elif criterion_title == "Fläche":
                size = value
            elif criterion_title == "Gesamtmiete":
                rent = value
            elif criterion_title == "Verfügbar ab":
                availability = value

        # Google-Maps link built from the coordinates.
        link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
        # Direct image URL derived from the resource ID, e.g.
        # https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=<resourceId>&name=get
        image_url = (
            "https://portal1s.easysquare.com/prorex/xmlforms/image.jpg"
            f"?application=ESQ_IA_REOBJ&command=action&id={image_resource_id}&name=get"
        )

        properties.append({
            "id": prop_id,
            "title": prop_title,
            "subtitle": subtitle,
            # Only format raw values that are actually present — the format
            # helpers raise on the "N/A" placeholder.
            "rooms": format.format_room(rooms) if rooms != "N/A" else rooms,
            "size": format.format_roomSize(size) if size != "N/A" else size,
            "rent": format.format_money(rent) if rent != "N/A" else rent,
            "link": link,
            "abstract": abstract,
            "warm_rent": "",
            "availability": (format.format_date(availability)
                             if availability != "N/A" else availability),
            "image_url": image_url,
        })
    return properties

Binary file not shown.

50
src/wogetra/scraper.py Normal file
View File

@ -0,0 +1,50 @@
import requests
from bs4 import BeautifulSoup
WOGETRA_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
# Scrape rental listings from the Wogetra website.
def scrape_wogetra():
    """Scrape the Wogetra rental page and return a list of property dicts.

    Each dict matches the schema the Discord webhook expects (id, title,
    subtitle, rooms, size, rent, link, abstract, warm_rent, availability,
    image_url). A listing with missing detail cells yields "N/A" values
    instead of crashing (the previous version chained .find(...).find(...)
    .text, which raised AttributeError when any cell was absent).
    """
    response = requests.get(WOGETRA_URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all property containers.
    property_elements = soup.find_all("div", class_="property-container")
    properties = []

    def _detail(details, css_class):
        # Safely read one value from the property-data grid.
        if not details:
            return "N/A"
        cell = details.find("div", class_=css_class)
        value = cell.find("div", class_="dd") if cell else None
        return value.text.strip() if value else "N/A"

    for prop in property_elements:
        # Extract property details.
        title_element = prop.find("h3", class_="property-title")
        subtitle_element = prop.find("div", class_="property-subtitle")
        link_element = title_element.find("a") if title_element else None
        details = prop.find("div", class_="property-data")

        properties.append({
            "id": prop.get("id", ""),
            "title": title_element.text.strip() if title_element else "No Title",
            "subtitle": subtitle_element.text.strip() if subtitle_element else "No Subtitle",
            "rooms": _detail(details, "data-anzahl_zimmer"),
            "size": _detail(details, "data-wohnflaeche"),
            "rent": _detail(details, "data-nettokaltmiete"),
            "link": link_element["href"] if link_element else "#",
            "abstract": "",
            "warm_rent": _detail(details, "data-warmmiete"),
            "availability": _detail(details, "data-verfuegbar_ab"),
            "image_url": "",
        })
    return properties

12
test.py Normal file
View File

@ -0,0 +1,12 @@
import src.lwb.scrape_image as scrape_image

# Example raw size value from the feed: B000000502800 -> 50,28 m²
room = "B000000502800"

# Download a sample listing image from the Easysquare portal
# (fixed the "iamge" typo in the variable name).
image = scrape_image.scrape_image("https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get")

# Save the downloaded image next to the script.
with open(f"image_{room}.jpg", "wb") as file:
    file.write(image)