fix:#82 added multithreaded fetching

This commit is contained in:
masterelmar
2023-12-12 12:48:44 +01:00
parent 57e3e41a9a
commit 184dc70be4

View File

@@ -5,6 +5,7 @@ import (
"net/http"
"strconv"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
@@ -29,27 +30,43 @@ func main() {
// Since the ids are not consecutive, it will take a while to fetch all sport courses.
// @TODO: find the highest id and iterate over all ids from 0 to highest id
func fetchAllHtwkSportCourses() []Event {
// NOTE(review): this listing looks like a diff rendered without +/- markers, so the
// sequential loop directly below is presumably the removed implementation and the
// concurrent code after it the added one — confirm against the repository.
var events []Event
for i := 0; i <= 9999; i++ {
newEvent, err := fetchHtwkSportCourse("https://sport.htwk-leipzig.de/sportangebote/detail/sport/", i)
if err != nil {
// Ids that yield an error are skipped.
continue
}
events = append(events, newEvent...)
// Request the pages concurrently to speed up the process.
var maxPageID = 9999
var maxThreads = 300
// One slot per page id; a slot stays nil when the request for that id returned an error.
var htmlPageArray = make([]*goquery.Document, maxPageID)
var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/"
var wg sync.WaitGroup
// One WaitGroup count per worker goroutine started below.
wg.Add(maxThreads)
for i := 0; i < maxThreads; i++ {
go func(i int) {
defer wg.Done()
// Worker i requests ids i, i+maxThreads, i+2*maxThreads, ..., so no two workers
// are assigned the same id or write to the same htmlPageArray index.
// The bound j < maxPageID stops at id maxPageID-1 (9998), whereas the sequential
// loop above uses i <= 9999.
for j := i; j < maxPageID; j += maxThreads {
doc, err := htmlRequest(url + strconv.Itoa(j))
if err == nil {
htmlPageArray[j] = doc
}
}
}(i)
}
// Block until every worker has called wg.Done.
wg.Wait()
println("finished fetching all pages")
//print count of all pages
// NOTE(review): no page count is printed in the lines shown, and events is returned
// without being filled from htmlPageArray — confirm whether parsing happens elsewhere.
var events []Event
return events
}
// fetchHtwkSportCourse fetches the sport course from the given url and id.
// If the sport course does not exist, it will return an error.
// If the sport course exists, it will return the sport course.
// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse.
// May be improved in the future.
func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
var events []Event
func htmlRequest(url string) (*goquery.Document, error) {
println("fetching " + url)
resp, err := http.Get(url + strconv.Itoa(id))
resp, err := http.Get(url)
if err != nil {
return nil, err
}
@@ -60,6 +77,18 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
return nil, err
}
println("finished fetching " + url)
return doc, nil
}
// fetchHtwkSportCourse extracts the sport course events from an already fetched page.
// If the document is not a sport course page, it returns an error.
// Otherwise, it returns the events found in the document.
// goquery is used to parse the HTML. The HTML structure is not very consistent, so it is hard to parse.
// This may be improved in the future.
func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
var events []Event
if doc.Find("h1").Text() == "Aktuelle Sportangebote" {
return nil, errors.New("not a sport course page")
}
@@ -73,6 +102,11 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
if len(titleParts) > 0 {
event.Title = strings.TrimSpace(titleParts[0])
}
if len(titleParts) > 2 {
details.Type = strings.TrimSpace(titleParts[len(titleParts)-1])
}
s.NextFiltered("table.eventDetails").Find("tr").Each(func(i int, s *goquery.Selection) {
key := strings.TrimSpace(s.Find("td").First().Text())
value := strings.TrimSpace(s.Find("td").Last().Text())
@@ -157,6 +191,7 @@ type EventDetails struct {
Location Location
Participants Participants
Cost string
Type string
}
// DateRange represents a start and end date.