mirror of
https://gitlab.dit.htwk-leipzig.de/fsr-im/tools/flatscraper.git
synced 2025-07-16 11:38:49 +02:00
feat: add VLW property scraping functionality to main process
This commit is contained in:
5
main.py
5
main.py
@ -6,6 +6,7 @@ import src.lwb.scraper as lwb_scraper
|
|||||||
import src.discord.webhook as localwebhook
|
import src.discord.webhook as localwebhook
|
||||||
import src.lipsia.lipsia as lipsia_scraper
|
import src.lipsia.lipsia as lipsia_scraper
|
||||||
import src.bgl.bgl as bgl_scraper
|
import src.bgl.bgl as bgl_scraper
|
||||||
|
import src.vlw.scraper as vlw_scraper
|
||||||
|
|
||||||
# URL of the website to scrape
|
# URL of the website to scrape
|
||||||
TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
|
TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/"
|
||||||
@ -44,6 +45,10 @@ def main():
|
|||||||
print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
|
print("Scraped " + str(len(properties_bgl)) + " properties from BGL")
|
||||||
properties += properties_bgl
|
properties += properties_bgl
|
||||||
|
|
||||||
|
properties_vlw = vlw_scraper.scrape_vlw()
|
||||||
|
print("Scraped " + str(len(properties_vlw)) + " properties from VLW")
|
||||||
|
properties += properties_vlw
|
||||||
|
|
||||||
|
|
||||||
for prop in properties:
|
for prop in properties:
|
||||||
if prop["id"] not in known_properties:
|
if prop["id"] not in known_properties:
|
||||||
|
77
src/vlw/scraper.py
Normal file
77
src/vlw/scraper.py
Normal file
@ -0,0 +1,77 @@
|
|||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
def scrape_vlw():
    """Scrape the rental listings shown on the VLW search results page.

    Sends an unfiltered search to https://vlw-eg.de/suchergebnisse and
    converts every listing found in the result list into a dict.

    Returns:
        A list of dicts, one per listing, with the keys ``id``, ``title``,
        ``subtitle``, ``rooms``, ``size``, ``rent``, ``link``, ``abstract``,
        ``warm_rent``, ``availability`` and ``image_url``. ``rent`` and
        ``abstract`` are always empty strings. An empty list is returned
        when the page contains no result-list container.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
    """
    # Reference query:
    # https://vlw-eg.de/suchergebnisse?search-price-min=&search-price-max=&search-size-from=&search-size-to=&search-room-min=&search-room-max=&send=suchen
    url = "https://vlw-eg.de/suchergebnisse"
    parameter = {
        "search-price-min": "",
        "search-price-max": "",
        "search-size-from": "",
        "search-size-to": "",
        "search-room-min": "",
        "search-room-max": "",
        # The reference query above names this field "send", not "senden".
        "send": "suchen",
    }

    # A timeout keeps one unresponsive site from blocking the whole run.
    response = requests.get(url=url, params=parameter, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    properties = []

    # All listings live inside <div class="estate-result-list">.
    estate_result_list = soup.find("div", class_="estate-result-list")
    if estate_result_list is None:
        # No result container (e.g. no listings or an error page).
        return properties

    # Each listing is a <div class="estate-item no-border">.
    estate_items = estate_result_list.find_all("div", class_="estate-item no-border")

    for estate in estate_items:
        # <div class="image-wrapper" style="background-image: url(' income/actual/new/42da0fdb1bcaed578d2256f1a0599bf6.jpg ');">
        # The URL sits between the single quotes and is padded with spaces.
        # NOTE(review): the sample URL is relative — confirm whether the
        # consumer needs it joined with the site base URL.
        image_url = estate.find("div", class_="image-wrapper")["style"].split("'")[1].strip()

        # <h4 class="heading_h4">3-Raumwohnung sucht Nachmieter – Großartiger Weitblick inklusive!!</h4>
        title = estate.find("h4", class_="heading_h4").text.strip()

        # Address and size both use <p class="size">:
        #   <p class="size"><i class="fa fa-map-marker"></i> Teichstr. 14</p>
        #   <p class="size"><i class="fa fa-signal"></i> 61 m²</p>
        # The first match is the address and the last one the size. When only
        # one element exists both values fall back to it.
        size_paragraphs = estate.find_all("p", class_="size")
        subtitle = size_paragraphs[0].text.strip()
        size = size_paragraphs[-1].text.strip()

        # <p class="rooms"><i class="fa fa-home"></i> 3 Zimmer</p>
        rooms = estate.find("p", class_="rooms").text.strip()

        # <p class="price">682 € warm</p>
        warm_rent = estate.find("p", class_="price").text.strip()

        # <p class="date"><i class="fa fa-calendar"></i> 01.03.2025</p>
        availability = estate.find("p", class_="date").text.strip()

        # <a class="link link-typ-2" href="https://vlw-eg.de/suchergebnisse?objekt_id=333111" title="Details und Kontakt">Details und Kontakt</a>
        link = estate.find("a", class_="link link-typ-2")["href"]
        # The listing id is the value of the first query parameter.
        property_id = link.split("=")[1]

        properties.append({
            "id": property_id,
            "title": "VLW - " + title,
            "subtitle": subtitle,
            "rooms": rooms,
            "size": size,
            "rent": "",
            "link": link,
            "abstract": "",
            "warm_rent": warm_rent,
            "availability": availability,
            "image_url": image_url,
        })

    return properties
|
Reference in New Issue
Block a user