add initial scraping functionality and related utilities

This commit is contained in:
Elmar Kresse
2025-01-13 10:44:01 +01:00
parent 9a0a72f640
commit b337b7c2f8
15 changed files with 376 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
.venv

1
known_properties.json Normal file
View File

@ -0,0 +1 @@
["803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "id-193-1-13", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "B4095706-A65C-F421-B02E-1D227B684B62", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "id-154-2-71", "id-105-12-78", "id-88-5-45"]

49
main.py Normal file
View File

@ -0,0 +1,49 @@
from bs4 import BeautifulSoup
import json
import time
import src.wogetra.scraper as wogetra_scraper
import src.lwb.scraper as lwb_scraper
import src.discord.webhook as localwebhook
# URL of the website to scrape
# NOTE(review): unused in this module — src/wogetra/scraper.py defines its own
# WOGETRA_URL; confirm and remove one of the two.
TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
# Store known property IDs to avoid duplicate notifications
# (persisted to known_properties.json between runs by main())
known_properties = set()
# Main loop to periodically check for new listings
def main():
    """Poll all property sources forever and notify Discord of new listings.

    Loads the persisted set of already-seen property IDs, then every five
    minutes scrapes Wogetra and LWB/Easysquare, sends a Discord notification
    for each unseen property, and persists the updated ID set.
    """
    global known_properties

    # Load known properties from file so restarts do not re-notify old listings.
    try:
        with open("known_properties.json", "r") as file:
            known_properties = set(json.load(file))
    except FileNotFoundError:
        print("No known properties file found. Starting fresh.")
    except json.JSONDecodeError:
        # A corrupt or half-written file must not kill the bot on startup.
        print("Known properties file is corrupt. Starting fresh.")

    while True:
        print("Scraping properties...")
        properties = []

        # One failing source must not prevent the other from being checked;
        # this is a long-running top-level loop, so log and keep going.
        try:
            print("Scraping properties from Wogetra...")
            properties += wogetra_scraper.scrape_wogetra()
        except Exception as err:
            print(f"Wogetra scrape failed: {err}")
        try:
            print("Scraping properties from LWB...")
            properties += lwb_scraper.scrape_easysquare()
        except Exception as err:
            print(f"LWB scrape failed: {err}")

        found_new = False
        for prop in properties:
            if prop["id"] not in known_properties:
                # Notify Discord and mark as known
                localwebhook.send_to_discord(prop)
                known_properties.add(prop["id"])
                found_new = True

        # Save known properties to file, but only when something changed,
        # to avoid a needless disk write every cycle.
        if found_new:
            with open("known_properties.json", "w") as file:
                json.dump(list(known_properties), file)

        # Wait before checking again
        print("Waiting for the next check...")
        time.sleep(300)  # Check every 5 minutes


if __name__ == "__main__":
    main()

0
requirements.txt Normal file
View File

Binary file not shown.

55
src/discord/webhook.py Normal file
View File

@ -0,0 +1,55 @@
import json
import requests
import src.lwb.scrape_image as scrape_image
# Webhook URL from Discord
# NOTE(review): this embeds a live webhook token in source control — it should
# be rotated and loaded from an environment variable or config file instead.
WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"
# Send a notification for one property to the Discord webhook.
def send_to_discord(property_data):
    """Post a formatted property listing to the Discord webhook.

    property_data: dict with the keys produced by the scrapers (title,
    subtitle, rooms, size, rent, warm_rent, availability, link, abstract,
    and optionally image_url). When image_url is set, the image is downloaded
    and attached; on download failure the message is sent without it.
    """
    message = (
        f"**{property_data['title']}**\n"
        f"{property_data['subtitle']}\n"
        f"**Zimmer:** {property_data['rooms']}\n"
        f"**Wohnfläche:** {property_data['size']}\n"
        f"**Gesamtmiete:** {property_data['rent']}\n"
        f"**Warmmiete:** {property_data['warm_rent']}\n"
        f"**Verfügbar ab:** {property_data['availability']}\n"
        f"**Link:** {property_data['link']}\n"
        f"**Beschreibung:** {property_data['abstract']}"
    )

    # Shared request pieces — previously duplicated across three branches.
    headers = {"Content-Type": "application/json"}
    payload = {"content": message}

    if property_data.get("image_url"):
        try:
            # Download the image and send the message with it attached.
            image_response = scrape_image.scrape_image(property_data["image_url"])
            files = {"file": ("image.jpg", image_response)}
            # With an attachment the payload goes as multipart form data.
            response = requests.post(WEBHOOK_URL, data=payload, files=files)
        except requests.exceptions.RequestException as e:
            # Fall back to a text-only message if the download fails.
            print(f"Fehler beim Herunterladen des Bildes: {e}")
            response = requests.post(WEBHOOK_URL, data=json.dumps(payload), headers=headers)
    else:
        # Send the message without an image.
        response = requests.post(WEBHOOK_URL, data=json.dumps(payload), headers=headers)

    # Any 2xx status counts as success.
    if 200 <= response.status_code < 300:
        print(f"Benachrichtigung gesendet: {property_data['title']} - response: {response.status_code}")
    else:
        print(f"Fehler beim Senden der Benachrichtigung: {response.status_code}")

Binary file not shown.

Binary file not shown.

Binary file not shown.

57
src/lwb/format.py Normal file
View File

@ -0,0 +1,57 @@
from datetime import datetime
def format_date(date):
    """Convert a raw value like "B20250113" into "13 January 2025"."""
    # Drop the leading "B" marker, then parse the remaining YYYYMMDD digits.
    parsed = datetime.strptime(date[1:], "%Y%m%d")
    # Render as "day month year"; the month name follows the active locale.
    return parsed.strftime("%d %B %Y")
def format_room(room):
    """Format a raw room value like "B000000030000" as "3 Zimmer".

    Keeps the last five digits, strips leading zeros, and reads the first
    remaining digit as the room count.
    """
    digits = room[-5:].lstrip("0")
    room_count = int(digits[:1])
    # The original if/else returned the identical string in both branches
    # (German "Zimmer" has no distinct plural), so the branch was dead code.
    return f"{room_count} Zimmer"
def format_money(money):
    """Format a raw amount like "B000000079900" as "799,00".

    The raw value is a "B"-prefixed integer in cents with leading zeros.
    """
    # Skip the "B" prefix and drop leading zeros.
    amount_part = money[1:].lstrip("0")
    # Pad to at least three digits so amounts below one euro still yield a
    # euros digit ("50" -> "050" -> "0,50" instead of ",50").
    amount_part = amount_part.zfill(3)
    euros = amount_part[:-2]
    cents = amount_part[-2:]
    # Combine with a decimal comma (German convention).
    return f"{euros},{cents}"
def format_roomSize(room):
    """Format a raw area value like "B000000502800" as "50,28".

    The raw value carries four decimal places; only the first two are
    meaningful for display (the sibling test script documents the intent:
    B000000502800 -> 50,28 m²). The previous version emitted all four
    decimals ("50,2800").
    """
    # Skip the "B" prefix and drop leading zeros.
    amount_part = room[1:].lstrip("0")
    # Whole square metres; fall back to "0" for areas below 1 m².
    meters = amount_part[:-4] or "0"
    # Keep only the first two of the four decimal digits.
    decimals = amount_part[-4:-2]
    return f"{meters},{decimals}"

35
src/lwb/scrape_image.py Normal file
View File

@ -0,0 +1,35 @@
import requests
# HTTP headers replayed from a captured browser session against
# portal1s.easysquare.com.
# NOTE(review): the Cookie pins a fixed SAP session ID — it will expire;
# confirm whether the image endpoint works without it or fetch a fresh one.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=zqFIhvNbEsOs_n3cgRTIO1V7ZaLQCxHvhYgKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1"
}
# Query parameters for the Easysquare API.
# NOTE(review): "_" looks like a cache-busting timestamp captured from the
# browser request — presumably any value is accepted; verify.
EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    "_": "1736595414769"
}
def scrape_image(url):
    """Download an image from the Easysquare portal and return its raw bytes.

    Returns empty bytes on a non-200 response. The previous version returned
    a list ([]) on failure, which crashed callers that write the result to a
    file or attach it to an upload; an empty bytes object keeps the return
    type consistent.
    """
    session = requests.Session()
    response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
        return b""
    # Return the raw image payload from the response.
    return response.content

116
src/lwb/scraper.py Normal file
View File

@ -0,0 +1,116 @@
import requests
import xml.etree.ElementTree as ET
import src.lwb.format as format
# Base endpoint of the Easysquare XML-forms API.
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"
# HTTP headers replayed from a captured browser session.
# NOTE(review): the Cookie pins a fixed SAP session ID — it will expire and
# should be obtained dynamically; verify against the portal.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1"
}
# Query parameters for the "boxlist" action (the listing feed).
# NOTE(review): "_" appears to be a cache-busting timestamp from the captured
# request — presumably any value works; verify.
EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    "_": "1736761256321"
}
# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \
# --header 'DNT: 1' \
# --header 'UTC: 1736761256321' \
# --header 'Host: portal1s.easysquare.com' \
# --header 'host: portal1s.easysquare.com' \
# --header 'Accept: text/plain, */*; q=0.01' \
# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \
# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \
# --header 'Sec-GPC: 1' \
# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \
# --header 'Connection: keep-alive' \
# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \
# --header 'Content-Type: text/plain;charset=UTF-8' \
# --header 'Sec-Fetch-Dest: empty' \
# --header 'Sec-Fetch-Mode: cors' \
# --header 'Sec-Fetch-Site: same-origin' \
# --header 'Accept-Encoding: gzip, deflate, br, zstd' \
# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \
# --header 'X-Requested-With: XMLHttpRequest'
# Scrape rental listings from the Easysquare portal (LWB).
def scrape_easysquare():
    """Fetch the Easysquare XML listing feed and return property dicts.

    Each dict matches the schema the Discord webhook expects (id, title,
    subtitle, rooms, size, rent, link, abstract, warm_rent, availability,
    image_url). Returns an empty list on HTTP errors. Missing XML elements
    no longer abort the whole scrape (the previous version dereferenced
    .text / .get on possibly-absent nodes).
    """
    session = requests.Session()
    response = session.get(EASYSQUARE_URL, headers=EASYSQUARE_HEADERS,
                           params=EASYSQUARE_PARAMS)
    if response.status_code != 200:
        print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
        return []

    # Parse the XML payload.
    root = ET.fromstring(response.content)
    namespace = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}

    properties = []
    for head in root.findall(".//ns:head", namespace):

        def child_text(tag):
            # A missing or empty element must not crash the whole scrape.
            node = head.find(f"ns:{tag}", namespace)
            return node.text if node is not None and node.text else ""

        prop_title = child_text("title")
        subtitle = child_text("subtitle")
        abstract = child_text("abstract").strip()
        prop_id = child_text("id")

        # Coordinates, e.g.
        # <address city="" lat="51.346061" lon="12.3774656" postcode="" street=""/>
        address = head.find("ns:address", namespace)
        lat = address.get("lat") if address is not None else ""
        lon = address.get("lon") if address is not None else ""

        image = head.find("ns:image", namespace)
        image_resource_id = image.get("resourceId") if image is not None else ""

        # Extract the listing details; values default to "N/A" when absent.
        rooms = "N/A"
        size = "N/A"
        rent = "N/A"
        availability = "N/A"
        for criterion in head.findall(".//ns:criterion", namespace):
            criterion_title = criterion.get("title")
            value = criterion.text.strip() if criterion.text else "N/A"
            if criterion_title == "Zimmer":
                rooms = value
            elif criterion_title == "Fläche":
                size = value
            elif criterion_title == "Gesamtmiete":
                rent = value
            elif criterion_title == "Verfügbar ab":
                availability = value

        # Google-Maps link built from the coordinates.
        link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"
        # Direct image URL derived from the resource ID, e.g.
        # https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=<resourceId>&name=get
        image_url = (
            "https://portal1s.easysquare.com/prorex/xmlforms/image.jpg"
            f"?application=ESQ_IA_REOBJ&command=action&id={image_resource_id}&name=get"
        )

        properties.append({
            "id": prop_id,
            "title": prop_title,
            "subtitle": subtitle,
            # Only format raw values that are actually present — the format
            # helpers raise on the "N/A" placeholder.
            "rooms": format.format_room(rooms) if rooms != "N/A" else rooms,
            "size": format.format_roomSize(size) if size != "N/A" else size,
            "rent": format.format_money(rent) if rent != "N/A" else rent,
            "link": link,
            "abstract": abstract,
            "warm_rent": "",
            "availability": (format.format_date(availability)
                             if availability != "N/A" else availability),
            "image_url": image_url,
        })
    return properties

Binary file not shown.

50
src/wogetra/scraper.py Normal file
View File

@ -0,0 +1,50 @@
import requests
from bs4 import BeautifulSoup
WOGETRA_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
# Scrape rental listings from the Wogetra website.
def scrape_wogetra():
    """Scrape the Wogetra rental page and return a list of property dicts.

    Each dict matches the schema the Discord webhook expects (id, title,
    subtitle, rooms, size, rent, link, abstract, warm_rent, availability,
    image_url). A listing with missing detail cells yields "N/A" values
    instead of crashing (the previous version chained .find(...).find(...)
    .text, which raised AttributeError when any cell was absent).
    """
    response = requests.get(WOGETRA_URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Find all property containers.
    property_elements = soup.find_all("div", class_="property-container")
    properties = []

    def _detail(details, css_class):
        # Safely read one value from the property-data grid.
        if not details:
            return "N/A"
        cell = details.find("div", class_=css_class)
        value = cell.find("div", class_="dd") if cell else None
        return value.text.strip() if value else "N/A"

    for prop in property_elements:
        # Extract property details.
        title_element = prop.find("h3", class_="property-title")
        subtitle_element = prop.find("div", class_="property-subtitle")
        link_element = title_element.find("a") if title_element else None
        details = prop.find("div", class_="property-data")

        properties.append({
            "id": prop.get("id", ""),
            "title": title_element.text.strip() if title_element else "No Title",
            "subtitle": subtitle_element.text.strip() if subtitle_element else "No Subtitle",
            "rooms": _detail(details, "data-anzahl_zimmer"),
            "size": _detail(details, "data-wohnflaeche"),
            "rent": _detail(details, "data-nettokaltmiete"),
            "link": link_element["href"] if link_element else "#",
            "abstract": "",
            "warm_rent": _detail(details, "data-warmmiete"),
            "availability": _detail(details, "data-verfuegbar_ab"),
            "image_url": "",
        })
    return properties

12
test.py Normal file
View File

@ -0,0 +1,12 @@
import src.lwb.scrape_image as scrape_image

# Example raw size value from the feed: B000000502800 -> 50,28 m²
room = "B000000502800"

# Download a sample listing image from the Easysquare portal
# (fixed the "iamge" typo in the variable name).
image = scrape_image.scrape_image("https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get")

# Save the downloaded image next to the script.
with open(f"image_{room}.jpg", "wb") as file:
    file.write(image)