From b2ef4eb5f27d73f5c797b06cf55c6dbd22b5aff9 Mon Sep 17 00:00:00 2001 From: Elmar Kresse Date: Mon, 13 Jan 2025 11:57:20 +0100 Subject: [PATCH] add scraping functionality for Lipsia and improve error handling for image downloads --- docker-compose.yml | 5 +++ known_properties.json | 2 +- main.py | 4 +- .../__pycache__/webhook.cpython-310.pyc | Bin 1514 -> 1634 bytes src/discord/webhook.py | 7 ++++ src/lipsia/__pycache__/lipsia.cpython-310.pyc | Bin 0 -> 1255 bytes src/lipsia/lipsia.py | 38 ++++++++++++++++++ .../__pycache__/scrape_image.cpython-310.pyc | Bin 1211 -> 1216 bytes src/lwb/__pycache__/scraper.cpython-310.pyc | Bin 2614 -> 2626 bytes src/lwb/scrape_image.py | 3 +- src/lwb/scraper.py | 2 +- .../__pycache__/scraper.cpython-310.pyc | Bin 1405 -> 1421 bytes src/wogetra/scraper.py | 2 +- 13 files changed, 58 insertions(+), 5 deletions(-) create mode 100644 docker-compose.yml create mode 100644 src/lipsia/__pycache__/lipsia.cpython-310.pyc create mode 100644 src/lipsia/lipsia.py diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..7c78c19 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,5 @@ +services: + python: + image: gitlab.dit.htwk-leipzig.de:5050/ekresse/flatscraper:main + container_name: flatscraper + \ No newline at end of file diff --git a/known_properties.json b/known_properties.json index c865079..1aa7521 100644 --- a/known_properties.json +++ b/known_properties.json @@ -1 +1 @@ -["803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", "id-193-1-13", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "B4095706-A65C-F421-B02E-1D227B684B62", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "id-154-2-71", "id-105-12-78", "id-88-5-45"] \ No newline at end of file +["9A0B42A2-6D9B-331A-DAA7-624A5FA49606", "BAAEC20A-F667-FE22-6693-E4B4CA366889", "FA45C0B6-813C-DE65-496A-EDD8DA3F2526", "id-105-12-78", "C02892BE-F34F-5A8A-E174-4A79549DC9A9", 51624, "803DF7B0-1125-2AA4-90AC-CF0CAEAC625A", 51628, 51632, "id-88-5-45", "E7B71D28-C557-CFE4-805D-42C2793E9248", "B57516F9-E364-7E54-A211-527ED54388E6", "21C20126-380D-9B0F-73F8-C4279897F189", "B4095706-A65C-F421-B02E-1D227B684B62", "id-154-2-71", "A979EBAE-EF87-FB51-152A-5453CD7DC794", "892BD779-F186-9BD1-A97A-5783EFB6F56D", "665243A8-FD34-86F8-322F-FE9B0B392083", "57446DF5-CB9F-951C-A40A-4BA775DA7426", "id-193-1-13", "172DFAD2-7CDB-51B4-212E-E6F9C7F0601A", "F7992488-7C24-DFA9-F8B2-94DDC18E66A3", "BD41AC52-BADB-428F-AF4B-11BBC374F2F1", "7421A7DD-D9B4-72D3-0A91-5C3DA60C26E4", "3B73B720-13F2-62A4-8829-557676725A95"] \ No newline at end of file diff --git a/main.py b/main.py index 40cae0b..da472e5 100644 --- a/main.py +++ b/main.py @@ -4,7 +4,7 @@ import time import src.wogetra.scraper as wogetra_scraper import src.lwb.scraper as lwb_scraper import src.discord.webhook as localwebhook - +import src.lipsia.lipsia as lipsia_scraper # URL of the website to scrape TARGET_URL = "https://www.wogetra.de/immobilien-vermarktungsart/miete/" @@ -30,6 +30,8 @@ def main(): properties = wogetra_scraper.scrape_wogetra() print("Scraping properties from LWB...") properties += lwb_scraper.scrape_easysquare() + print("Scraping properties from Lipsia...") + properties += lipsia_scraper.scrape_lipsia() for prop in properties: if prop["id"] not in known_properties: diff --git a/src/discord/__pycache__/webhook.cpython-310.pyc b/src/discord/__pycache__/webhook.cpython-310.pyc index dfbbda8229db0f69a3e9b7718b93a7ee211845cf..1ab96eb08a53b36d415940c5acccfacffccd40db 100644 GIT binary patch delta 609 zcmah_ziSjh6rMLT`{PWqqFydJO>#d1$~8u~SXhaoAQ%WlBqGSMWG3hCCA;hF1j#Xn zf|go@NtGjzl-B+UskF75(o#?nYsJ|+n-t;{Z{GXf_rCAVe3|$>8KhHFXOKOQe!T8} zDD4H`;b8A~_F||Io30`;>|dAY2#u>_7!XBRrH92632f#S0AjNWA{bDKVI1m(7it$qkCEiWmF42g`s49-#i5wV7Am5FO(G zo9VAG-}%QP%85&Cq34K`QC90AabmUW_F=e+hKMx^-1^Xwk6qe!n}4yfx5ZkE?;~6S z?`QF2{kA?Nmv63cE_nS`BA=&18gC;_Bu``u8_Gf%8t*9&85g;so6#2Um!j>k%df^+ zw}nMuRrEet(w~U9=42vDr11rRw!?F22eLGmW@7#B?R%@M_d5^QRt$~vG%*g_+0Jqu z(wg3e0cVJDoouy@;_~1hY zT$r?38P4Ot)=ltV;$Oa7fihWuG8T(=xaEmidx>fN#;MgV6zNP6ik@hk4o&^hss91N C2BE(I delta 561 zcmZ9KF;5#Y6vv-^cJ4wUji9syNSky(L`bQsPL--km0F3J4wX=)=o&leU6Q-Qb_4`R zE$!R^-H4FLn4uFJAH@@5WZ@g2`VJVVhiAXvd+-0ZEMLxEJu1a>b2DJq#lf%cj(1eL zMxCRZrRT5rT)a%02Zoogj_bN~Vz3akX`)`+rRr!XjJkQ8CR= z)gFkc)qZa8h%0am>_x_{56OMtGuUeFj@^1&)~KJI;3C>Eq!aY{s2B|VE^mfO*SPIC zQan-(YzXs}Z)nybM{DU{XRX}8qGX@TxZkRv71=P)SgJuZuWP-HDz1)32)`%#<4`qXHAP)$fiW?VW`{G Zc<%-=8}|8oWHXt44G%mtkBKu&{sGZXnzsM| diff --git a/src/discord/webhook.py b/src/discord/webhook.py index b097ccc..32ce523 100644 --- a/src/discord/webhook.py +++ b/src/discord/webhook.py @@ -33,6 +33,13 @@ def send_to_discord(property_data): # Download the image image_response = scrape_image.scrape_image(property_data["image_url"]) + # Check if the image was downloaded successfully + if image_response == b"": + print("Fehler beim Herunterladen des Bildes: Leere Antwort") + payload = {"content": message} + response = requests.post(WEBHOOK_URL, data=json.dumps(payload), headers=headers) + return + # Send the message with an image attachment files = {"file": ("image.jpg", image_response)} payload = {"content": message} diff --git a/src/lipsia/__pycache__/lipsia.cpython-310.pyc b/src/lipsia/__pycache__/lipsia.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f822c0681ad4bdf55c1132cdbe9fd63f32fe9b19 GIT binary patch literal 1255 zcmZ8h&2QT_6elIwmhCuAyESRD59qLEFwod5dMS)%9k35z*w9ONGSUf*$VyBoQ6=em z5#&o-V1Gn6AU+P*X@9~_`x|)NPQC50U3%K1l&--jJif2rdwhJ4kQ|K$2-e?!Jx_n= zAoQnGE-w$1Pl0p<8$}cgGC9L+t7AE0sPC7{s#F}5Kkh8do-#93ai z5QM8hp8@GT3>onX8FY?ow8kr3;}eg1r@q=YKJ}gU0{!GYgHuxU0M+0u_d$8)&{(0*k ztuftfkXIaX2Q~7Rlj+q9?b;>JTlDRJl0`5t zGK*|i8)6pP_Vgl2nAX+p{{u@UOQsBGdKP_EJRyS9NHJZ?LbK>1I&cSBUl3yvFj-U| z|96$;a*|4!3N}e(KFvv~r){azdr<4)t^LOfrp^vu;hWauwCaD;Ts%66=3&MN6}({c zD}=(4n#I~E0^H*0jja?_ER~s$Drf^!F)1pN2?ehW&4){w6^{gANyZidSfc#5U#srd zM3@<*$T>4CJ_Rji3ng2`A5VXo4>O)-0?5R-E-SZfhvy_^T(np2$GkQ}?>6Jz^JqVl+LZICB07IMr5H)porq>dL#jC(dz3>u@2 zwtmWt9k!j06G@rvlnSP6gA*+a+a+ZQ1Ghe;ABWb5a2MPT$OdlCcJOA1HsFTkst47p zzH8A|JDu1~CU&L%jMN9jnk>i^AQQ#o z8XOq!=@=j6>hI(g#cXO}7{!*DT#}ie7d5$-NlP<|JF%c3Co?$_Bvi#!oLHcnoRgWF zR}#gZoS&PUn3r;kB`+~IHEQw&rYwPUhG2#orY!bg22B<}O{S>HUd-0KnjE({ic$+p zQ;SQAqb5&hHnEQ4E-pzdDJ_mq&QD3b#ad952{A9P1Zc1(a};xFQBD+lacXff$Q?k< z#Rd6!#i=V9ig+fovH0sVvauBL0X?Y6Qp66Via0?8$je~SA|4P6LGVs)VX*-Kgo#ie diff --git a/src/lwb/__pycache__/scraper.cpython-310.pyc b/src/lwb/__pycache__/scraper.cpython-310.pyc index e19655092405be80cc357907e34103a57ca36664..f6c31967cc33b034affc42ea963e8201ff1501cf 100644 GIT binary patch delta 99 zcmdlca!7PXi1mb$Q!`SxNLG9a~$L1&3l=77}-*!YuLmYCZA@JmzF6J zSRhg(w2-kzJVka6_d-TShCkvQ30HpO=@50SNRSw4@7f}F2ZV~m@xvrS`SWSX4Ec?AIL!56^* diff --git a/src/lwb/scrape_image.py b/src/lwb/scrape_image.py index c46b6c1..09f1eef 100644 --- a/src/lwb/scrape_image.py +++ b/src/lwb/scrape_image.py @@ -27,7 +27,8 @@ def scrape_image(url): if response.status_code != 200: print(f"Fehler beim Abrufen von Easysquare: {response.status_code}") - return [] + # return empty image + return b'' # get image from response diff --git a/src/lwb/scraper.py b/src/lwb/scraper.py index 0e9dd18..b622a55 100644 --- a/src/lwb/scraper.py +++ b/src/lwb/scraper.py @@ -101,7 +101,7 @@ def scrape_easysquare(): properties.append({ "id": id, - "title": prop_title, + "title": "LWB - " + prop_title, "subtitle": subtitle, "rooms": format.format_room(rooms), "size": format.format_roomSize(size), diff --git a/src/wogetra/__pycache__/scraper.cpython-310.pyc b/src/wogetra/__pycache__/scraper.cpython-310.pyc index 2c24036bbb601659f3e87dcd373c52e32deb0af2..c774dd2fb80603e9adb06c0d1b5b9435bb3515b6 100644 GIT binary patch delta 169 zcmey%)yvJB&&$ij00JLd(nBZmZepA^@qhv2ri~w+7&TJFYq-Q2YPf57YxrvTYj{$) zYXoWpQ@B$k=5Q}$WMn85t6>gi(3G4U&h&$)iYq)nJ+-7LQ9)N>vOn`C#;D0mEXhDJ qjm3dce)4jbd?_|YCJe~+m4&ZJak2rcl&B)m6-Dx>gu>)_RyhEqxh53= delta 152 zcmeC>{>#Oi&&$ij00b>%P3i6vc{edmns~r~al^)sPK+uwTs7P^yfu6^{53o&+%*C< zf+^f7;&ZqcGBPq0iq$X&GiXXoj%NBXIi7hF Zg|A3?vIDD>s4~#jB1Kd}X>vZR8~`G&BM1Nh diff --git a/src/wogetra/scraper.py b/src/wogetra/scraper.py index 9f1fcdf..e296389 100644 --- a/src/wogetra/scraper.py +++ b/src/wogetra/scraper.py @@ -35,7 +35,7 @@ def scrape_wogetra(): # Add property to list properties.append({ "id": property_id, - "title": title, + "title": "Wogetra - "+ title, "subtitle": subtitle, "rooms": rooms, "size": size,