"""Scraper for rental property listings on bgl.de (BGL Leipzig)."""


def get_element_from_facts(facts, category):
    """Return the value stored under *category* in a (category, value) list.

    Returns "N/A" when the category is missing or its value is empty, so
    callers always receive a displayable string. (The original implementation
    fell off the end and returned None for unknown categories, which is
    inconsistent with its own empty-value path and breaks string building
    downstream.)
    """
    for fact_category, fact_value in facts:
        if fact_category == category:
            return fact_value if fact_value else "N/A"
    return "N/A"


def fetch_all_properties():
    """Scrape all rental offers from bgl.de and return them as dicts.

    Fetches the listing overview page, then one detail page per property,
    and extracts title, room count, size, rents, availability and image URL.

    Returns:
        list[dict]: one dict per property using the key set the Discord
        webhook expects (id, title, subtitle, rooms, size, rent, link,
        abstract, warm_rent, availability, image_url, owner).
    """
    # Local imports keep this module importable without the scraping stack
    # installed (only fetch_all_properties needs it).
    import requests
    from bs4 import BeautifulSoup

    bgl_url = "https://www.bgl.de"
    base_url = bgl_url + "/vermietung"

    # Search filter: property_type 3 selects flats; results sorted by
    # ascending price. Empty strings mean "no restriction".
    url_params = {
        "address": "",
        "property_type": 3,
        "nroom": "",
        "sqft_min": "",
        "sqft_max": "",
        "max_price": "",
        "type_aufzug": "LIKE",
        "type_balkon": "LIKE",
        "sortby": "a.price",
        "orderby": "asc",
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Cache-Control': 'max-age=0',
    }

    # BUG FIX: the original built the URL with f"{base_url}?{url_params}",
    # which interpolates the dict's repr ("?{'address': '', ...}") instead of
    # a URL-encoded query string — the server ignored every filter. Passing
    # params= lets requests encode it correctly.
    response = requests.get(base_url, headers=headers, params=url_params,
                            timeout=30)

    soup = BeautifulSoup(response.text, "html.parser")

    # Each offer on the overview page lives in a "property-item z-0" div.
    bgl_page_properties = soup.find_all("div", {"class": "property-item z-0"})

    properties = []

    for prop in bgl_page_properties:
        # The detail link is the styled "details" button anchor; only the
        # href is needed.
        prop_url = prop.find(
            "a",
            {"class": "btn btn-primary details z-0 me-2 mb-2 stretched-link_"},
        )["href"]
        print(prop_url)

        detail_response = requests.get(bgl_url + prop_url, headers=headers,
                                       timeout=30)
        prop_soup = BeautifulSoup(detail_response.text, "html.parser")

        # The address headline doubles as the listing title.
        prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()

        # (category, value) tuples harvested from the fact lists below.
        facts = []

        steckbrief = prop_soup.find("div", {"class": "infoblock px-3 pt-2"})
        facts_ul = steckbrief.find_all("ul", {"class": "facts"})

        # First <ul class="facts">: general facts (rooms, size, floor,
        # object number, availability).
        for li in facts_ul[0].find_all("li"):
            value = li.find("strong").text.strip()
            category = li.find("span").text.strip()
            facts.append((category, value))

        # Second <ul class="facts">: cost facts (cold rent, utilities,
        # heating, total rent).
        for li in facts_ul[1].find_all("li"):
            value = li.find("strong").text.strip()
            category = li.find("span").text.strip()
            facts.append((category, value))

        room_count = get_element_from_facts(facts, "Zimmer")
        size = get_element_from_facts(facts, "Wohnfläche")
        obj_id = get_element_from_facts(facts, "Objektnummer")
        availability = get_element_from_facts(facts, "verfügbar ab")
        cold_rent = get_element_from_facts(facts, "Kaltmiete")
        rent = get_element_from_facts(facts, "Miete inkl. Nebenkosten")

        # The listing photo is the first responsive image on the detail page.
        image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]

        properties.append({
            "id": obj_id,
            "title": "BGL - " + prop_title,
            "subtitle": "",
            "rooms": room_count,
            "size": size,
            "rent": cold_rent,
            "link": bgl_url + prop_url,
            "abstract": "",
            "warm_rent": rent,
            "availability": availability,
            "image_url": image_url,
            "owner": "BGL",
        })

    return properties
def scrape_image(url, owner):
    """Download a listing image and return its raw bytes.

    Args:
        url: Absolute image URL.
        owner: Source portal ("BGL" or "LWB") — selects the request setup,
            since Easysquare images need the authenticated headers/params.

    Returns:
        bytes: the image payload, or b'' when the download fails or the
        owner is unknown (callers treat b'' as "no image available").
    """
    session = requests.Session()

    if owner == "BGL":
        # BGL serves images publicly — a plain GET is enough.
        response = session.get(url)
        source_name = "BGL"
    elif owner == "LWB":
        # Easysquare requires the scraper's session headers and params.
        response = session.get(url, headers=EASYSQUARE_HEADERS,
                               params=EASYSQUARE_PARAMS)
        source_name = "Easysquare"
    else:
        # BUG FIX: previously an unrecognized owner fell through to
        # `response.content` with `response` unbound (UnboundLocalError).
        print(f"Unbekannter Eigentümer: {owner}")
        return b''

    if response.status_code != 200:
        print(f"Fehler beim Abrufen von {source_name}: {response.status_code}")
        # return empty image
        return b''

    # get image from response
    return response.content
hashlib.sha256(hashID.encode('utf-8')).hexdigest() properties.append({ "id": id, @@ -208,6 +208,7 @@ def scrape_easysquare(): "warm_rent": "", "availability": format.format_date(availability), "image_url": image_url, + "owner": "LWB", }) return properties \ No newline at end of file