Mirror of https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git (synced 2025-07-16 03:28:48 +02:00)
feat: add BGL scraper and integrate into main scraping process; update image scraping logic
.gitignore (vendored, 1 change)
@@ -1 +1,2 @@
 .venv
+*.pyc
main.py (15 changes)
@@ -5,6 +5,7 @@ import src.wogetra.scraper as wogetra_scraper
 import src.lwb.scraper as lwb_scraper
 import src.discord.webhook as localwebhook
 import src.lipsia.lipsia as lipsia_scraper
+import src.bgl.bgl as bgl_scraper
 
 # URL of the website to scrape
 TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
@@ -26,12 +27,14 @@ def main():
     while True:
         current_time = time.strftime("%H:%M:%S", time.localtime())
         print("Scraping properties at " + current_time)
-        print("Scraping properties from Wogetra...")
-        properties = wogetra_scraper.scrape_wogetra()
-        print("Scraping properties from LWB...")
-        properties += lwb_scraper.scrape_easysquare()
-        print("Scraping properties from Lipsia...")
-        properties += lipsia_scraper.scrape_lipsia()
+        # print("Scraping properties from Wogetra...")
+        # properties = wogetra_scraper.scrape_wogetra()
+        # print("Scraping properties from LWB...")
+        # properties += lwb_scraper.scrape_easysquare()
+        # print("Scraping properties from Lipsia...")
+        # properties += lipsia_scraper.scrape_lipsia()
+        print("Scraping properties from BGL...")
+        properties = bgl_scraper.fetch_all_properties()
 
         for prop in properties:
             if prop["id"] not in known_properties:
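Note: as committed, main() scrapes only BGL, since the Wogetra, LWB, and Lipsia calls are commented out and fetch_all_properties() assigns to properties rather than appending. A minimal sketch of re-enabling all sources (assuming the commented calls are restored unchanged):

    properties = wogetra_scraper.scrape_wogetra()
    properties += lwb_scraper.scrape_easysquare()
    properties += lipsia_scraper.scrape_lipsia()
    properties += bgl_scraper.fetch_all_properties()  # append, so BGL does not overwrite the other results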
src/bgl/bgl.py (new file, 117 lines)
@@ -0,0 +1,117 @@
+import requests
+from bs4 import BeautifulSoup
+
+
+def get_element_from_facts(facts, category):
+    for x in facts:
+        if x[0] == category:
+            return x[1] if x[1] else "N/A"
+
+
+def fetch_all_properties():
+    bgl_url = "https://www.bgl.de"
+    base_url = bgl_url + "/vermietung"
+
+    url_params = {
+        "address": "",
+        "property_type": 3,
+        "nroom": "",
+        "sqft_min": "",
+        "sqft_max": "",
+        "max_price": "",
+        "type_aufzug": "LIKE",
+        "type_balkon": "LIKE",
+        "sortby": "a.price",
+        "orderby": "asc"
+    }
+
+    url = f"{base_url}?{url_params}"
+
+    headers = {
+        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0',
+        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+        'Accept-Encoding': 'gzip, deflate, sdch, br',
+        'Accept-Language': 'de,en-US;q=0.7,en;q=0.3',
+        'Connection': 'keep-alive',
+        'Upgrade-Insecure-Requests': '1',
+        'Cache-Control': 'max-age=0',
+    }
+
+    response = requests.request("GET", url, headers=headers)
+
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    # get all divs with class "property-item z-0"
+    bgl_page_properties = soup.find_all("div", {"class": "property-item z-0"})
+
+    properties = []
+
+    for prop in bgl_page_properties:
+        # get the <a> with class "btn btn-primary details z-0 me-2 mb-2 stretched-link_" and extract only the href
+        prop_url = prop.find("a", {"class": "btn btn-primary details z-0 me-2 mb-2 stretched-link_"})["href"]
+        print(prop_url)
+        response = requests.request("GET", bgl_url + prop_url, headers=headers)
+
+        prop_soup = BeautifulSoup(response.text, "html.parser")
+
+        # get the <h3> with class "adresse" and extract the text
+        prop_title = prop_soup.find("h3", {"class": "adresse"}).text.strip()
+
+        # build a (category, value) tuple list
+        facts = []
+
+        steckbrief = prop_soup.find("div", {"class": "infoblock px-3 pt-2"})
+        # get all <ul>s with class "facts"
+        facts_ul = steckbrief.find_all("ul", {"class": "facts"})
+        # the first <ul> holds the general facts, starting with the room count
+        general_lis = facts_ul[0].find_all("li")
+
+        for li in general_lis:
+            value = li.find("strong").text.strip()
+            category = li.find("span").text.strip()
+            facts.append((category, value))
+
+        # pull the entries with the matching categories out of facts
+        room_count = get_element_from_facts(facts, "Zimmer")
+        size = get_element_from_facts(facts, "Wohnfläche")
+        level = get_element_from_facts(facts, "Etage")
+        obj_id = get_element_from_facts(facts, "Objektnummer")
+        availability = get_element_from_facts(facts, "verfügbar ab")
+
+        money_li = facts_ul[1].find_all("li")
+
+        for li in money_li:
+            value = li.find("strong").text.strip()
+            category = li.find("span").text.strip()
+            facts.append((category, value))
+
+        cold_rent = get_element_from_facts(facts, "Kaltmiete")
+        other_costs = get_element_from_facts(facts, "Nebenkosten")
+        heating_costs = get_element_from_facts(facts, "inkl. Heizkosten")
+        rent = get_element_from_facts(facts, "Miete inkl. Nebenkosten")
+
+        # the image is in an <img> tag with class "img-responsive"
+        image_url = prop_soup.find("img", {"class": "img-responsive"})["src"]
+
+        properties.append({
+            "id": obj_id,
+            "title": "BGL - " + prop_title,
+            "subtitle": "",
+            "rooms": room_count,
+            "size": size,
+            "rent": cold_rent,
+            "link": bgl_url + prop_url,
+            "abstract": "",
+            "warm_rent": rent,
+            "availability": availability,
+            "image_url": image_url,
+            "owner": "BGL",
+        })
+
+    return properties
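Note: f-string-formatting the url_params dict produces a query string like "?{'address': '', ...}" rather than standard key=value pairs, so the filters are effectively ignored and the page presumably serves its default listing. A hedged alternative sketch (with url_params and headers as defined in the file above), letting requests encode the parameters itself:

    import requests

    # sketch, not part of the commit: requests URL-encodes the dict into
    # ?address=&property_type=3&... instead of interpolating its repr
    response = requests.get("https://www.bgl.de/vermietung", params=url_params, headers=headers)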
src/discord/webhook.py
@@ -15,7 +15,7 @@ def send_to_discord(property_data):
         f"{property_data['subtitle']}\n"
         f"**Zimmer:** {property_data['rooms']}\n"
         f"**Wohnfläche:** {property_data['size']}\n"
-        f"**Gesamtmiete:** {property_data['rent']}\n"
+        f"**Miete:** {property_data['rent']}\n"
         f"**Warmmiete:** {property_data['warm_rent']}\n"
         f"**Verfügbar ab:** {property_data['availability']}\n"
         f"**Link:** {property_data['link']}\n"
@@ -31,7 +31,7 @@ def send_to_discord(property_data):
     try:
 
         # Download the image
-        image_response = scrape_image.scrape_image(property_data["image_url"])
+        image_response = scrape_image.scrape_image(property_data["image_url"], property_data["owner"])
 
         # Check if the image was downloaded successfully
         if image_response == b"":
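Note: this second hunk is why every scraper in this commit now sets an "owner" field; send_to_discord forwards it so scrape_image can pick the right fetch strategy per portal. A hedged sketch of the call path (module path and values assumed from the rest of the commit, not confirmed by it):

    import src.lwb.scrape_image as scrape_image  # assumed module path

    property_data = {"image_url": "https://www.bgl.de/example.jpg", "owner": "BGL"}  # hypothetical example
    image_response = scrape_image.scrape_image(property_data["image_url"], property_data["owner"])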
src/lipsia/lipsia.py
@@ -33,6 +33,7 @@ def scrape_lipsia():
             "warm_rent": "",  # Placeholder as warm rent is not provided
             "availability": (item.get("highlight_3", "")),  # Customize as needed
             "image_url": item.get("image", ""),
+            "owner": "Lipsia",
         })
 
     return properties
src/lwb/scrape_image.py
@@ -1,16 +1,23 @@
 import requests
 from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS
 
-def scrape_image(url):
+def scrape_image(url, owner):
     session = requests.Session()
-    response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
 
-    if response.status_code != 200:
-        print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
-        # return empty image
-        return b''
+    if owner == "BGL":
+        response = session.get(url)
+        if response.status_code != 200:
+            print(f"Fehler beim Abrufen von BGL: {response.status_code}")
+            # return empty image
+            return b''
+
+    if owner == "LWB":
+        response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
+        if response.status_code != 200:
+            print(f"Fehler beim Abrufen von Easysquare: {response.status_code}")
+            # return empty image
+            return b''
 
     # get image from response
     return response.content
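Note: as committed, scrape_image only handles the "BGL" and "LWB" owners; for any other value (e.g. "Lipsia", which this commit also tags) it falls through to response.content with response unbound and raises NameError. A minimal hedged variant that treats LWB as the special case and everything else as a plain download:

    import requests
    from src.lwb.scraper import EASYSQUARE_HEADERS, EASYSQUARE_PARAMS

    def scrape_image(url, owner):
        session = requests.Session()
        if owner == "LWB":
            # the Easysquare portal needs its session headers and params
            response = session.get(url, headers=EASYSQUARE_HEADERS, params=EASYSQUARE_PARAMS)
        else:
            # BGL, Lipsia, etc. serve images without special headers
            response = session.get(url)
        if response.status_code != 200:
            print(f"Fehler beim Abrufen von {owner}: {response.status_code}")
            return b''  # return empty image
        return response.content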
src/lwb/scraper.py
@@ -1,7 +1,7 @@
 import requests
 import xml.etree.ElementTree as ET
 import src.lwb.format as format
-import gzip
+import hashlib
 
 SESSION_CREATE_URL = "https://portal1s.easysquare.com/meinelwb/index.html?deeplink=%2FESQ_IA_REOBJ%2FESQ_VM_REOBJ_ALL"
 
@@ -194,7 +194,7 @@ def scrape_easysquare():
 
         # the id should be a hash created from the title, subtitle, rooms, size, rent, availability
         hashID = f"{prop_title}{subtitle}{rooms}{size}{rent}{availability}"
-        id = hash(hashID)
+        id = hashlib.sha256(hashID.encode('utf-8')).hexdigest()
 
         properties.append({
             "id": id,
@@ -208,6 +208,7 @@ def scrape_easysquare():
             "warm_rent": "",
             "availability": format.format_date(availability),
             "image_url": image_url,
+            "owner": "LWB",
         })
 
     return properties
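Note: the switch from hash() to hashlib.sha256 makes the IDs stable across runs: Python salts str hashing per process (PYTHONHASHSEED), so hash() would give the same flat a different ID after every restart and known_properties would never match. A small illustration (the concatenated property string is made up):

    import hashlib

    hashID = "Musterstr. 1, Leipzig2 Zimmer58 m2450 EUR01.03.2025"  # hypothetical concatenation
    prop_id = hashlib.sha256(hashID.encode("utf-8")).hexdigest()
    # same input -> same 64-char hex digest in every run, unlike built-in hash()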