feat: add BGL scraper and integrate into main scraping process; update image scraping logic

This commit is contained in:
Elmar Kresse
2025-02-09 19:33:53 +01:00
parent 010a3249cf
commit f1c9816d9e
7 changed files with 147 additions and 17 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
.venv
*.pyc

15
main.py
View File

@ -5,6 +5,7 @@ import src.wogetra.scraper as wogetra_scraper
import src.lwb.scraper as lwb_scraper
import src.discord.webhook as localwebhook
import src.lipsia.lipsia as lipsia_scraper
import src.bgl.bgl as bgl_scraper
# URL of the website to scrape
TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
@ -26,12 +27,14 @@ def main():
while True:
current_time = time.strftime("%H:%M:%S", time.localtime())
print("Scraping properties at " + current_time)
print("Scraping properties from Wogetra...")
properties = wogetra_scraper.scrape_wogetra()
print("Scraping properties from LWB...")
properties += lwb_scraper.scrape_easysquare()
print("Scraping properties from Lipsia...")
properties += lipsia_scraper.scrape_lipsia()
# print("Scraping properties from Wogetra...")
# properties = wogetra_scraper.scrape_wogetra()
# print("Scraping properties from LWB...")
# properties += lwb_scraper.scrape_easysquare()
# print("Scraping properties from Lipsia...")
# properties += lipsia_scraper.scrape_lipsia()
print("Scraping properties from BGL...")
properties = bgl_scraper.fetch_all_properties()
for prop in properties:
if prop["id"] not in known_properties:

117
src/bgl/bgl.py Normal file
View File

@ -0,0 +1,117 @@
import requests
from bs4 import BeautifulSoup
def get_element_from_facts(facts, category):
    """Look up *category* in a list of ``(category, value)`` tuples.

    Args:
        facts: list of ``(category, value)`` string pairs scraped from
            the property detail page.
        category: the fact label to look for (e.g. ``"Zimmer"``).

    Returns:
        The matching value, or ``"N/A"`` when the value is empty.
        BUG FIX: also returns ``"N/A"`` (instead of the implicit ``None``)
        when the category is not present at all, so downstream formatting
        never sees ``None``.
    """
    for fact_category, fact_value in facts:
        if fact_category == category:
            return fact_value if fact_value else "N/A"
    return "N/A"
def fetch_all_properties():
    """Scrape all rental listings from bgl.de.

    Fetches the listing overview page, then follows each property card's
    detail link and extracts its facts (rooms, size, rent, ...).

    Returns:
        list[dict]: one dict per property with the keys the downstream
        webhook expects (``id``, ``title``, ``rooms``, ``size``, ``rent``,
        ``link``, ``warm_rent``, ``availability``, ``image_url``, ``owner``).
    """
    bgl_url = "https://www.bgl.de"
    base_url = bgl_url + "/vermietung"
    # Search filter as used by the site's own form.
    # NOTE(review): property_type=3 presumably selects flats — confirm
    # against the site's form values.
    url_params = {
        "address": "",
        "property_type": 3,
        "nroom": "",
        "sqft_min": "",
        "sqft_max": "",
        "max_price": "",
        "type_aufzug": "LIKE",
        "type_balkon": "LIKE",
        "sortby": "a.price",
        "orderby": "asc",
    }
    # Browser-like headers; the site may block the default requests UA.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
    }

    # BUG FIX: the previous code interpolated the params dict straight into
    # the URL (f"{base_url}?{url_params}"), yielding an invalid query string
    # like "?{'address': '', ...}". Let requests URL-encode it properly.
    response = requests.get(base_url, headers=headers, params=url_params, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Every listing card is a div with class "property-item z-0".
    listing_cards = soup.find_all("div", {"class": "property-item z-0"})

    properties = []
    for card in listing_cards:
        # The "details" button carries the relative link to the detail page.
        details_link = card.find(
            "a", {"class": "btn btn-primary details z-0 me-2 mb-2 stretched-link_"}
        )
        if details_link is None or not details_link.get("href"):
            # Skip malformed cards instead of crashing the whole scrape run.
            continue
        prop_url = details_link["href"]
        print(prop_url)

        response = requests.get(bgl_url + prop_url, headers=headers, timeout=30)
        prop_soup = BeautifulSoup(response.text, "html.parser")

        # Address headline doubles as the listing title.
        prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()

        # Collect (category, value) pairs from every facts list in the
        # "Steckbrief" info block (first ul: general facts, second: costs).
        facts = []
        steckbrief = prop_soup.find("div", {"class": "infoblock px-3 pt-2"})
        for facts_list in steckbrief.find_all("ul", {"class": "facts"}):
            for item in facts_list.find_all("li"):
                value = item.find("strong").text.strip()
                category = item.find("span").text.strip()
                facts.append((category, value))

        room_count = get_element_from_facts(facts, "Zimmer")
        size = get_element_from_facts(facts, "Wohnfläche")
        obj_id = get_element_from_facts(facts, "Objektnummer")
        availability = get_element_from_facts(facts, "verfügbar ab")
        cold_rent = get_element_from_facts(facts, "Kaltmiete")
        rent = get_element_from_facts(facts, "Miete inkl. Nebenkosten")

        # Listing photo; tolerate detail pages without one.
        image_tag = prop_soup.find("img", {"class": "img-responsive"})
        image_url = image_tag["src"] if image_tag else ""

        properties.append({
            "id": obj_id,
            "title": "BGL - " + prop_title,
            "subtitle": "",
            "rooms": room_count,
            "size": size,
            "rent": cold_rent,
            "link": bgl_url + prop_url,
            "abstract": "",
            "warm_rent": rent,
            "availability": availability,
            "image_url": image_url,
            "owner": "BGL",
        })
    return properties

View File

@ -15,7 +15,7 @@ def send_to_discord(property_data):
f"{property_data['subtitle']}\n"
f"**Zimmer:** {property_data['rooms']}\n"
f"**Wohnfläche:** {property_data['size']}\n"
f"**Gesamtmiete:** {property_data['rent']}\n"
f"**Miete:** {property_data['rent']}\n"
f"**Warmmiete:** {property_data['warm_rent']}\n"
f"**Verfügbar ab:** {property_data['availability']}\n"
f"**Link:** {property_data['link']}\n"
@ -31,7 +31,7 @@ def send_to_discord(property_data):
try:
# Download the image
image_response = scrape_image.scrape_image(property_data["image_url"])
image_response = scrape_image.scrape_image(property_data["image_url"], property_data["owner"])
# Check if the image was downloaded successfully
if image_response == b"":

View File

@ -33,6 +33,7 @@ def scrape_lipsia():
"warm_rent": "", # Placeholder as warm rent is not provided
"availability": (item.get("highlight_3", "")), # Customize as needed
"image_url": item.get("image", ""),
"owner": "Lipsia",
})
return properties

View File

@ -1,16 +1,23 @@
import requests
from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
def scrape_image(url):
def scrape_image(url, owner):
    """Download the listing image at *url* and return its raw bytes.

    Args:
        url: absolute image URL.
        owner: source portal name ("LWB", "BGL", "Lipsia", ...). Only
            LWB/Easysquare needs the portal's session headers and params;
            every other owner is fetched with a plain GET.

    Returns:
        bytes: the image content, or ``b''`` when the download failed.
    """
    session = requests.Session()

    if owner == "LWB":
        # Easysquare only serves images with its session headers/params.
        response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
        if response.status_code != 200:
            print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
            # return empty image
            return b''
    else:
        # BUG FIX: previously only "BGL" and "LWB" were handled, so any
        # other owner (e.g. "Lipsia") left `response` unbound and raised
        # UnboundLocalError. A plain GET is the correct fallback.
        response = session.get(url)
        if response.status_code != 200:
            print(f"Fehler beim Abrufen von {owner}: {response.status_code}")
            # return empty image
            return b''

    # get image from response
    return response.content

View File

@ -1,7 +1,7 @@
import requests
import xml.etree.ElementTree as ET
import src.lwb.format as format
import gzip
import hashlib
SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
@ -194,7 +194,7 @@ def scrape_easysquare():
# the id should be a hash create by the title, subtitle, rooms, size, rent, availability
hashID = f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
id = hash(hashID)
id = hashlib.sha256(hashID.encode('utf-8')).hexdigest()
properties.append({
"id": id,
@ -208,6 +208,7 @@ def scrape_easysquare():
"warm_rent": "",
"availability": format.format_date(availability),
"image_url": image_url,
"owner": "LWB",
})
return properties