mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-07-15 11:08:48 +02:00
add initial scraping functionality and related utilities
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
.venv
|
1
known_properties.json
Normal file
1
known_properties.json
Normal file
@ -0,0 +1 @@
|
||||
["803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "id-193-1-13", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "B4095706-A65C-F421-B02E-1D227B684B62", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "id-154-2-71", "id-105-12-78", "id-88-5-45"]
|
49
main.py
Normal file
49
main.py
Normal file
@ -0,0 +1,49 @@
|
||||
from bs4 import BeautifulSoup
|
||||
import json
|
||||
import time
|
||||
import src.wogetra.scraper as wogetra_scraper
|
||||
import src.lwb.scraper as lwb_scraper
|
||||
import src.discord.webhook as localwebhook
|
||||
|
||||
|
||||
# URL of the website to scrape
# NOTE(review): not referenced anywhere in this module; kept for compatibility.
TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"

# File in which already-announced property IDs are persisted between runs
KNOWN_PROPERTIES_FILE = "known_properties.json"

# Seconds to wait between two scraping rounds (5 minutes)
CHECK_INTERVAL_SECONDS = 300

# Store known property IDs to avoid duplicate notifications
known_properties = set()


def _load_known_properties(path):
    """Return the set of property IDs stored in *path*, or an empty set.

    A missing or unreadable file is not fatal: the monitor then simply starts
    without any history.
    """
    try:
        with open(path, "r", encoding="utf-8") as file:
            return set(json.load(file))
    except FileNotFoundError:
        print("No known properties file found. Starting fresh.")
    except (json.JSONDecodeError, TypeError) as error:
        # A truncated or hand-edited file must not prevent the monitor from
        # starting.
        print(f"Known properties file is not valid ({error}). Starting fresh.")
    return set()


def _save_known_properties(path, property_ids):
    """Write *property_ids* to *path* as a JSON list in a stable order."""
    with open(path, "w", encoding="utf-8") as file:
        # Sorting keeps the file content stable between runs; key=str avoids
        # a TypeError should an ID ever not be a string.
        json.dump(sorted(property_ids, key=str), file)


# Main loop to periodically check for new listings
def main():
    """Poll all scrapers forever and announce unseen listings on Discord.

    Known property IDs are loaded from ``known_properties.json`` on start and
    written back after every round. A failure of a single scraper or of a
    single notification is reported and skipped, so that a transient error
    does not terminate the monitor.
    """
    global known_properties

    # Load known properties from file
    known_properties = _load_known_properties(KNOWN_PROPERTIES_FILE)

    scrapers = (
        ("Wogetra", wogetra_scraper.scrape_wogetra),
        ("LWB", lwb_scraper.scrape_easysquare),
    )

    while True:
        print("Scraping properties...")
        properties = []
        for source_name, scrape in scrapers:
            print(f"Scraping properties from {source_name}...")
            try:
                properties += scrape()
            except Exception as error:
                # Top-level boundary of a long-running job: report the
                # problem and try this source again in the next round.
                print(f"Scraping {source_name} failed: {error}")

        for prop in properties:
            if prop["id"] in known_properties:
                continue
            try:
                # Notify Discord and mark as known
                localwebhook.send_to_discord(prop)
            except Exception as error:
                # Not marked as known, so the notification is retried in the
                # next round.
                print(f"Notification for {prop['id']} failed: {error}")
                continue
            known_properties.add(prop["id"])

        # Save known properties to file
        _save_known_properties(KNOWN_PROPERTIES_FILE, known_properties)

        # Wait before checking again
        print("Waiting for the next check...")
        time.sleep(CHECK_INTERVAL_SECONDS)


if __name__ == "__main__":
    main()
|
0
requirements.txt
Normal file
0
requirements.txt
Normal file
BIN
src/discord/__pycache__/webhook.cpython-310.pyc
Normal file
BIN
src/discord/__pycache__/webhook.cpython-310.pyc
Normal file
Binary file not shown.
55
src/discord/webhook.py
Normal file
55
src/discord/webhook.py
Normal file
@ -0,0 +1,55 @@
|
||||
import json
|
||||
import requests
|
||||
|
||||
import src.lwb.scrape_image as scrape_image
|
||||
|
||||
# Webhook URL from Discord
# NOTE(review): this is a credential committed to the repository; anyone who
# can read it can post to the channel. Consider rotating it and loading it
# from configuration instead.
WEBHOOK_URL = "https://discord.com/api/webhooks/1327600813367099462/goqeWDyYwi13-6F0yopUzFkHVaZs01bCe-2SI8bPJLj3WNMhxLOlIYBRIGyTpSzGCSru"

# Maximum number of characters Discord accepts in a message's "content" field
DISCORD_CONTENT_LIMIT = 2000

# Seconds to wait for Discord before giving up on a request
REQUEST_TIMEOUT_SECONDS = 30


def _build_message(property_data):
    """Return the Markdown text for *property_data*, capped to Discord's limit."""
    message = (
        f"**{property_data['title']}**\n"
        f"{property_data['subtitle']}\n"
        f"**Zimmer:** {property_data['rooms']}\n"
        f"**Wohnfläche:** {property_data['size']}\n"
        f"**Gesamtmiete:** {property_data['rent']}\n"
        f"**Warmmiete:** {property_data['warm_rent']}\n"
        f"**Verfügbar ab:** {property_data['availability']}\n"
        f"**Link:** {property_data['link']}\n"
        f"**Beschreibung:** {property_data['abstract']}"
    )
    if len(message) > DISCORD_CONTENT_LIMIT:
        # An over-long description would make Discord reject the whole
        # message, so cut it and mark the cut with an ellipsis.
        message = message[: DISCORD_CONTENT_LIMIT - 3] + "..."
    return message


# Function: send a message to Discord
def send_to_discord(property_data):
    """Post a listing to the Discord webhook, with its image when available.

    Args:
        property_data: dict with the keys ``title``, ``subtitle``, ``rooms``,
            ``size``, ``rent``, ``warm_rent``, ``availability``, ``link`` and
            ``abstract``; an optional ``image_url`` names a picture to attach.

    Returns:
        None. The outcome is reported on stdout.

    Raises:
        KeyError: if one of the required keys is missing.
        requests.exceptions.RequestException: if the text-only request to
            Discord fails at transport level.
    """
    payload = {"content": _build_message(property_data)}
    response = None

    # Check for optional image URL
    image_url = property_data.get("image_url")
    if image_url:
        try:
            # Download the image
            image_data = scrape_image.scrape_image(image_url)

            # The downloader hands back an empty, falsy value when the
            # download fails; uploading that would raise a TypeError inside
            # requests, so only attach real data.
            if image_data:
                # Send the message with an image attachment
                files = {"file": ("image.jpg", image_data)}
                response = requests.post(
                    WEBHOOK_URL,
                    data=payload,
                    files=files,
                    timeout=REQUEST_TIMEOUT_SECONDS,
                )
        except requests.exceptions.RequestException as e:
            print(f"Fehler beim Herunterladen des Bildes: {e}")

    if response is None:
        # Send the message without an image (none given, or the image path
        # failed above)
        response = requests.post(
            WEBHOOK_URL, json=payload, timeout=REQUEST_TIMEOUT_SECONDS
        )

    # The message was sent successfully when the status code is >= 200 and < 300
    if 200 <= response.status_code < 300:
        print(f"Benachrichtigung gesendet: {property_data['title']} - response: {response.status_code}")
    else:
        print(f"Fehler beim Senden der Benachrichtigung: {response.status_code}")
|
||||
|
BIN
src/lwb/__pycache__/format.cpython-310.pyc
Normal file
BIN
src/lwb/__pycache__/format.cpython-310.pyc
Normal file
Binary file not shown.
BIN
src/lwb/__pycache__/scrape_image.cpython-310.pyc
Normal file
BIN
src/lwb/__pycache__/scrape_image.cpython-310.pyc
Normal file
Binary file not shown.
BIN
src/lwb/__pycache__/scraper.cpython-310.pyc
Normal file
BIN
src/lwb/__pycache__/scraper.cpython-310.pyc
Normal file
Binary file not shown.
57
src/lwb/format.py
Normal file
57
src/lwb/format.py
Normal file
@ -0,0 +1,57 @@
|
||||
from datetime import datetime
|
||||
|
||||
def format_date(date):
    """Convert a prefixed date such as ``"B20250115"`` to ``"15 January 2025"``.

    The first character (the "B" prefix) is skipped and the rest is parsed as
    ``YYYYMMDD``. The month name comes from ``strftime("%B")`` and therefore
    follows the process locale.

    Args:
        date: the raw value.

    Returns:
        The formatted date, or *date* unchanged when it cannot be parsed
        (for example the ``"N/A"`` placeholder used for missing values).
    """
    try:
        # Extract the date part (skip the "B" prefix) and parse it
        date_object = datetime.strptime(date[1:], "%Y%m%d")
    except (TypeError, ValueError):
        # A missing or malformed value must not abort the whole scrape.
        return date

    # Format as "day month year"
    return date_object.strftime("%d %B %Y")
|
||||
|
||||
def format_room(room):
    """Return the room count of a raw value as ``"<n> Zimmer"``.

    Only the last five characters are considered; leading zeros are removed
    and the first remaining digit is taken as the room count.

    NOTE(review): with this rule a count of ten or more, or a fractional
    count, cannot be represented - confirm the encoding of the raw value.

    Args:
        room: the raw value, e.g. ``"B000000000300"``.

    Returns:
        The formatted room count, or *room* unchanged when no digit can be
        extracted (for example the ``"N/A"`` placeholder).
    """
    try:
        room_count = int(room[-5:].lstrip("0")[:1])
    except (TypeError, ValueError):
        # Placeholder, empty or all-zero value: pass it through untouched
        # instead of crashing.
        return room

    # "Zimmer" is both singular and plural, so no case distinction is needed.
    return f"{room_count} Zimmer"
|
||||
|
||||
|
||||
|
||||
def format_money(money):
    """Format a prefixed cent amount, e.g. ``"B000000079900"`` -> ``"799,00 €"``.

    The first character (the "B" prefix) is skipped; the remaining digits are
    read as an amount whose last two digits are the cents.

    Args:
        money: the raw value.

    Returns:
        The amount with a decimal comma and a euro sign, or *money* unchanged
        when it does not consist of a prefix followed by digits (for example
        the ``"N/A"`` placeholder).
    """
    # Extract the amount part (skip the "B" prefix)
    digits = money[1:] if isinstance(money, str) else ""
    if not (digits.isascii() and digits.isdigit()):
        return money

    # Split numerically instead of slicing the zero-stripped string, so that
    # amounts below one euro keep their leading "0" and two-digit cents.
    euros, cents = divmod(int(digits), 100)
    return f"{euros},{cents:02d} €"
|
||||
|
||||
def format_roomSize(room):
    """Format a prefixed area value, e.g. ``"B000000502800"`` -> ``"50,28 m²"``.

    The first character (the "B" prefix) is skipped; the last four digits of
    the rest are the fractional part. Trailing zeros of the fraction are
    dropped, but at least two decimals are always shown.

    Args:
        room: the raw value.

    Returns:
        The area with a decimal comma and the unit, or *room* unchanged when
        it does not consist of a prefix followed by digits (for example the
        ``"N/A"`` placeholder).
    """
    # Extract the amount part (skip the "B" prefix)
    digits = room[1:] if isinstance(room, str) else ""
    if not (digits.isascii() and digits.isdigit()):
        return room

    # Split numerically so that values below 1 m² keep their leading "0".
    meters, fraction = divmod(int(digits), 10_000)

    # "2800" -> "28", "5000" -> "50", "1250" -> "125": no precision is lost.
    decimals = f"{fraction:04d}".rstrip("0").ljust(2, "0")
    return f"{meters},{decimals} m²"
|
35
src/lwb/scrape_image.py
Normal file
35
src/lwb/scrape_image.py
Normal file
@ -0,0 +1,35 @@
|
||||
import requests
|
||||
|
||||
# Request headers sent with every image download.
# NOTE(review): the Cookie carries a fixed session ID - presumably it expires
# at some point; confirm how long it stays valid.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=zqFIhvNbEsOs_n3cgRTIO1V7ZaLQCxHvhYgKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1",
}

# Query parameters added to the image URL.
# NOTE(review): callers may pass URLs that already carry "application",
# "command" and "name" in their query string; requests appends these
# parameters in addition - confirm the portal tolerates the duplicates.
EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    "_": "1736595414769",
}

# Seconds to wait for the portal before giving up on a request
REQUEST_TIMEOUT_SECONDS = 30


def scrape_image(url):
    """Download the image at *url* and return its raw bytes.

    Args:
        url: address of the image on the Easysquare portal.

    Returns:
        The response body as ``bytes``; empty ``bytes`` when the server does
        not answer with status 200, so callers can test the result for
        truthiness and always get the same type.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the timeout expires.
    """
    # The context manager releases the session's connections again.
    with requests.Session() as session:
        response = session.get(
            url,
            headers=EASYSQUARE_HEADERS,
            params=EASYSQUARE_PARAMS,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

        if response.status_code != 200:
            print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
            return b""

        # The image is the raw response body
        return response.content
|
||||
|
116
src/lwb/scraper.py
Normal file
116
src/lwb/scraper.py
Normal file
@ -0,0 +1,116 @@
|
||||
import requests
|
||||
import xml.etree.ElementTree as ET
|
||||
import src.lwb.format as format
|
||||
|
||||
EASYSQUARE_URL = "https://portal1s.easysquare.com/prorex/xmlforms"

# Request headers sent with the listing query.
# NOTE(review): the Cookie carries a fixed session ID - presumably it expires
# at some point; confirm how long it stays valid.
EASYSQUARE_HEADERS = {
    "DNT": "1",
    "Host": "portal1s.easysquare.com",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Cookie": "SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d; sap-usercontext=sap-language=D&sap-client=581; cookiesession1=678ADA67ADF0813997206FE9F4133118",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
    "Accept-Encoding": "gzip, deflate, br, zstd",
    "Accept-Language": "de,en-US;q=0.7,en;q=0.3",
    "Upgrade-Insecure-Requests": "1",
}

EASYSQUARE_PARAMS = {
    "application": "ESQ_IA_REOBJ",
    "sap-client": "581",
    "command": "action",
    "name": "boxlist",
    "api": "6.169",
    "head-oppc-version": "6.169.22",
    "_": "1736761256321",
}

# Reference: the request the headers and parameters above were taken from.
# curl --location 'https://portal1s.easysquare.com/prorex/xmlforms?application=ESQ_IA_REOBJ&sap-client=581&command=action&name=boxlist&api=6.169&head-oppc-version=6.169.22&_=1736761255682' \
# --header 'DNT: 1' \
# --header 'UTC: 1736761256321' \
# --header 'Host: portal1s.easysquare.com' \
# --header 'host: portal1s.easysquare.com' \
# --header 'Accept: text/plain, */*; q=0.01' \
# --header 'Cookie: cookiesession1=678ADA67ADF0813997206FE9F4133118; sap-usercontext=sap-language=de&sap-client=581; SAP_SESSIONID_PP0_581=Vg3w4pn8whD76BldaU2wvP-YzyrRkRHvhWoKELG5Agg%3d' \
# --header 'Referer: https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL' \
# --header 'Sec-GPC: 1' \
# --header 'oppc-id: D9925A2D-4ED9-4911-8AD3-2626DA41FBB0' \
# --header 'Connection: keep-alive' \
# --header 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0' \
# --header 'Content-Type: text/plain;charset=UTF-8' \
# --header 'Sec-Fetch-Dest: empty' \
# --header 'Sec-Fetch-Mode: cors' \
# --header 'Sec-Fetch-Site: same-origin' \
# --header 'Accept-Encoding: gzip, deflate, br, zstd' \
# --header 'Accept-Language: de,en-US;q=0.7,en;q=0.3' \
# --header 'X-Requested-With: XMLHttpRequest'

# XML namespace of the portal's response document
_EASYSQUARE_NAMESPACE = {"ns": "http://www.openpromos.com/OPPC/XMLForms"}

# Maps the title attribute of a <criterion> element to the detail it carries
_CRITERION_FIELDS = {
    "Zimmer": "rooms",
    "Fläche": "size",
    "Gesamtmiete": "rent",
    "Verfügbar ab": "availability",
}

# Seconds to wait for the portal before giving up on a request
_REQUEST_TIMEOUT_SECONDS = 30


def _child_text(element, tag, default=""):
    """Return the stripped text of child *tag* of *element*, else *default*."""
    child = element.find(tag, _EASYSQUARE_NAMESPACE)
    if child is None or child.text is None:
        return default
    return child.text.strip()


def _child_attribute(element, tag, attribute):
    """Return *attribute* of child *tag* of *element*, or None if absent."""
    child = element.find(tag, _EASYSQUARE_NAMESPACE)
    return None if child is None else child.get(attribute)


def _format_detail(formatter, raw_value):
    """Apply *formatter* to *raw_value* unless it is missing or malformed."""
    if raw_value == "N/A":
        return raw_value
    try:
        return formatter(raw_value)
    except (TypeError, ValueError):
        # Show the raw value rather than losing the whole listing.
        return raw_value


def _parse_head(head):
    """Turn one <head> element into a property dict; None if it has no ID."""
    property_id = _child_text(head, "ns:id", default=None)
    if not property_id:
        # Without an ID the listing cannot be de-duplicated.
        print("Easysquare-Eintrag ohne ID übersprungen")
        return None

    # Extract the details
    details = dict.fromkeys(_CRITERION_FIELDS.values(), "N/A")
    for criterion in head.findall(".//ns:criterion", _EASYSQUARE_NAMESPACE):
        field = _CRITERION_FIELDS.get(criterion.get("title"))
        if field is not None:
            details[field] = criterion.text.strip() if criterion.text else "N/A"

    # The address element carries coordinates only, e.g.
    # <address city="" lat="51.346061" lon="12.3774656" postcode="" street=""/>
    # so the link points to a Google Maps search for that position.
    lat = _child_attribute(head, "ns:address", "lat")
    lon = _child_attribute(head, "ns:address", "lon")
    link = ""
    if lat and lon:
        link = f"https://www.google.com/maps/search/?api=1&query={lat},{lon}"

    # Example image address:
    # https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get
    image_resource_id = _child_attribute(head, "ns:image", "resourceId")
    image_url = ""
    if image_resource_id:
        image_url = f"https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id={image_resource_id}&name=get"

    return {
        "id": property_id,
        "title": _child_text(head, "ns:title", default="No Title"),
        "subtitle": _child_text(head, "ns:subtitle", default="No Subtitle"),
        "rooms": _format_detail(format.format_room, details["rooms"]),
        "size": _format_detail(format.format_roomSize, details["size"]),
        "rent": _format_detail(format.format_money, details["rent"]),
        "link": link,
        "abstract": _child_text(head, "ns:abstract"),
        "warm_rent": "",
        "availability": _format_detail(format.format_date, details["availability"]),
        "image_url": image_url,
    }


# Function: scrape Easysquare
def scrape_easysquare():
    """Fetch the listings from the Easysquare portal.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``subtitle``,
        ``rooms``, ``size``, ``rent``, ``link``, ``abstract``, ``warm_rent``,
        ``availability`` and ``image_url``. The list is empty when the portal
        does not answer with status 200 or does not deliver parseable XML.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the timeout expires.
    """
    # The context manager releases the session's connections again.
    with requests.Session() as session:
        response = session.get(
            EASYSQUARE_URL,
            headers=EASYSQUARE_HEADERS,
            params=EASYSQUARE_PARAMS,
            timeout=_REQUEST_TIMEOUT_SECONDS,
        )

        if response.status_code != 200:
            print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
            return []

        body = response.content

    # Parse the XML data
    try:
        root = ET.fromstring(body)
    except ET.ParseError as error:
        # A reply that is not XML must not crash the caller.
        print(f"Fehler beim Parsen der Easysquare-Antwort: {error}")
        return []

    properties = []
    for head in root.findall(".//ns:head", _EASYSQUARE_NAMESPACE):
        parsed = _parse_head(head)
        if parsed is not None:
            properties.append(parsed)

    return properties
|
BIN
src/wogetra/__pycache__/scraper.cpython-310.pyc
Normal file
BIN
src/wogetra/__pycache__/scraper.cpython-310.pyc
Normal file
Binary file not shown.
50
src/wogetra/scraper.py
Normal file
50
src/wogetra/scraper.py
Normal file
@ -0,0 +1,50 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
WOGETRA_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"

# Seconds to wait for the website before giving up on a request
_REQUEST_TIMEOUT_SECONDS = 30


def _detail_text(details, css_class):
    """Return the text of the "dd" div inside the *css_class* div, else "N/A".

    *details* is the "property-data" container of a listing, or None when the
    listing has none.
    """
    if details is None:
        return "N/A"
    row = details.find("div", class_=css_class)
    if row is None:
        return "N/A"
    value = row.find("div", class_="dd")
    if value is None:
        return "N/A"
    return value.text.strip()


# Function: scrape Wogetra
def scrape_wogetra():
    """Fetch the rental listings from the Wogetra website.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``subtitle``,
        ``rooms``, ``size``, ``rent``, ``link``, ``abstract``, ``warm_rent``,
        ``availability`` and ``image_url``. Details a listing does not show
        are reported as ``"N/A"``. The list is empty when the site does not
        answer with status 200.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
            the timeout expires.
    """
    response = requests.get(WOGETRA_URL, timeout=_REQUEST_TIMEOUT_SECONDS)
    if response.status_code != 200:
        # Do not try to parse an error page as a listing overview.
        print(f"Fehler beim Abrufen von Wogetra: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    properties = []

    # Find all property containers
    for prop in soup.find_all("div", class_="property-container"):
        # Extract property details
        title_element = prop.find("h3", class_="property-title")
        subtitle_element = prop.find("div", class_="property-subtitle")
        link_element = title_element.find("a") if title_element is not None else None
        details = prop.find("div", class_="property-data")

        # Add property to list
        properties.append({
            "id": prop.get("id", ""),
            "title": title_element.text.strip() if title_element is not None else "No Title",
            "subtitle": subtitle_element.text.strip() if subtitle_element is not None else "No Subtitle",
            # Each lookup falls back to "N/A" on its own, so that one missing
            # field does not abort the whole scrape.
            "rooms": _detail_text(details, "data-anzahl_zimmer"),
            "size": _detail_text(details, "data-wohnflaeche"),
            "rent": _detail_text(details, "data-nettokaltmiete"),
            "link": link_element["href"] if link_element is not None else "#",
            "abstract": "",
            "warm_rent": _detail_text(details, "data-warmmiete"),
            "availability": _detail_text(details, "data-verfuegbar_ab"),
            "image_url": "",
        })

    return properties
|
12
test.py
Normal file
12
test.py
Normal file
@ -0,0 +1,12 @@
|
||||
import src.lwb.scrape_image as scrape_image
|
||||
|
||||
# Manual check: download one listing image and store it on disk.

# B000000502800 -> 50,28 m²
# NOTE(review): this raw area value is only used to build the file name below.
room = "B000000502800"

image = scrape_image.scrape_image("https://portal1s.easysquare.com/prorex/xmlforms/image.jpg?application=ESQ_IA_REOBJ&command=action&id=1EC8D4E6-191A-A827-47FF-72D8C5379070&name=get")

# Save the image. scrape_image hands back an empty, falsy value when the
# download fails; writing that would raise a TypeError and leave an empty
# file behind.
if image:
    with open(f"image_{room}.jpg", "wb") as file:
        file.write(image)
else:
    print("No image received; nothing written.")
|
Reference in New Issue
Block a user