From 184dc70be483e4fe2cbf11acd1c8523467eb8ec9 Mon Sep 17 00:00:00 2001 From: masterelmar <18119527+masterElmar@users.noreply.github.com> Date: Tue, 12 Dec 2023 12:48:44 +0100 Subject: [PATCH] fix:#82 added multithreaded fetching --- backend/sport/main.go | 65 +++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/backend/sport/main.go b/backend/sport/main.go index b62e2c2..31164b2 100644 --- a/backend/sport/main.go +++ b/backend/sport/main.go @@ -5,6 +5,7 @@ import ( "net/http" "strconv" "strings" + "sync" "time" "github.com/PuerkitoBio/goquery" @@ -29,27 +30,43 @@ func main() { // Since the ids are not consecutive, it will take a while to fetch all sport courses. // @TODO: find the highest id and iterate over all ids from 0 to highest id func fetchAllHtwkSportCourses() []Event { - var events []Event - for i := 0; i <= 9999; i++ { - newEvent, err := fetchHtwkSportCourse("https://sport.htwk-leipzig.de/sportangebote/detail/sport/", i) - if err != nil { - continue - } - events = append(events, newEvent...) + //multithreaded webpage requests to speed up the process + + var maxPageID = 9999 + var maxThreads = 300 + var htmlPageArray = make([]*goquery.Document, maxPageID) + var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/" + + var wg sync.WaitGroup + wg.Add(maxThreads) + + for i := 0; i < maxThreads; i++ { + go func(i int) { + defer wg.Done() + for j := i; j < maxPageID; j += maxThreads { + doc, err := htmlRequest(url + strconv.Itoa(j)) + if err == nil { + htmlPageArray[j] = doc + } + } + }(i) } + + wg.Wait() + + println("finished fetching all pages") + + //print count of all pages + + var events []Event return events } -// fetchHtwkSportCourse fetches the sport course from the given url and id. -// If the sport course does not exist, it will return an error. -// If the sport course exists, it will return the sport course. -// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse. -// May be improved in the future. -func fetchHtwkSportCourse(url string, id int) ([]Event, error) { - var events []Event +func htmlRequest(url string) (*goquery.Document, error) { + println("fetching " + url) - resp, err := http.Get(url + strconv.Itoa(id)) + resp, err := http.Get(url) if err != nil { return nil, err } @@ -60,6 +77,18 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) { return nil, err } + println("finished fetching " + url) + return doc, nil +} + +// fetchHtwkSportCourse fetches the sport course from the given url and id. +// If the sport course does not exist, it will return an error. +// If the sport course exists, it will return the sport course. +// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse. +// May be improved in the future. +func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) { + var events []Event + if doc.Find("h1").Text() == "Aktuelle Sportangebote" { return nil, errors.New("not a sport course page") } @@ -73,6 +102,11 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) { if len(titleParts) > 0 { event.Title = strings.TrimSpace(titleParts[0]) } + + if len(titleParts) > 2 { + details.Type = strings.TrimSpace(titleParts[len(titleParts)-1]) + } + s.NextFiltered("table.eventDetails").Find("tr").Each(func(i int, s *goquery.Selection) { key := strings.TrimSpace(s.Find("td").First().Text()) value := strings.TrimSpace(s.Find("td").Last().Text()) @@ -157,6 +191,7 @@ type EventDetails struct { Location Location Participants Participants Cost string + Type string } // DateRange represents a start and end date.