mirror of
https://gitlab.dit.htwk-leipzig.de/htwk-software/htwkalender.git
synced 2025-08-02 17:59:14 +02:00
fix:#82 added multithreaded fetching
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
||||
"net/http"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/PuerkitoBio/goquery"
|
||||
@@ -29,27 +30,43 @@ func main() {
|
||||
// Since the ids are not consecutive, it will take a while to fetch all sport courses.
|
||||
// @TODO: find the highest id and iterate over all ids from 0 to highest id
|
||||
func fetchAllHtwkSportCourses() []Event {
|
||||
var events []Event
|
||||
for i := 0; i <= 9999; i++ {
|
||||
newEvent, err := fetchHtwkSportCourse("https://sport.htwk-leipzig.de/sportangebote/detail/sport/", i)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
events = append(events, newEvent...)
|
||||
|
||||
//multithreaded webpage requests to speed up the process
|
||||
|
||||
var maxPageID = 9999
|
||||
var maxThreads = 300
|
||||
var htmlPageArray = make([]*goquery.Document, maxPageID)
|
||||
var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/"
|
||||
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(maxThreads)
|
||||
|
||||
for i := 0; i < maxThreads; i++ {
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
for j := i; j < maxPageID; j += maxThreads {
|
||||
doc, err := htmlRequest(url + strconv.Itoa(j))
|
||||
if err == nil {
|
||||
htmlPageArray[j] = doc
|
||||
}
|
||||
}
|
||||
}(i)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
println("finished fetching all pages")
|
||||
|
||||
//print count of all pages
|
||||
|
||||
var events []Event
|
||||
return events
|
||||
}
|
||||
|
||||
// fetchHtwkSportCourse fetches the sport course from the given url and id.
|
||||
// If the sport course does not exist, it will return an error.
|
||||
// If the sport course exists, it will return the sport course.
|
||||
// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse.
|
||||
// May be improved in the future.
|
||||
func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
|
||||
var events []Event
|
||||
func htmlRequest(url string) (*goquery.Document, error) {
|
||||
println("fetching " + url)
|
||||
|
||||
resp, err := http.Get(url + strconv.Itoa(id))
|
||||
resp, err := http.Get(url)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -60,6 +77,18 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
println("finished fetching " + url)
|
||||
return doc, nil
|
||||
}
|
||||
|
||||
// fetchHtwkSportCourse parses the sport course events from the given, already fetched HTML document.
|
||||
// If the sport course does not exist, it will return an error.
|
||||
// If the sport course exists, it will return the sport course.
|
||||
// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse.
|
||||
// May be improved in the future.
|
||||
func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
|
||||
var events []Event
|
||||
|
||||
if doc.Find("h1").Text() == "Aktuelle Sportangebote" {
|
||||
return nil, errors.New("not a sport course page")
|
||||
}
|
||||
@@ -73,6 +102,11 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
|
||||
if len(titleParts) > 0 {
|
||||
event.Title = strings.TrimSpace(titleParts[0])
|
||||
}
|
||||
|
||||
if len(titleParts) > 2 {
|
||||
details.Type = strings.TrimSpace(titleParts[len(titleParts)-1])
|
||||
}
|
||||
|
||||
s.NextFiltered("table.eventDetails").Find("tr").Each(func(i int, s *goquery.Selection) {
|
||||
key := strings.TrimSpace(s.Find("td").First().Text())
|
||||
value := strings.TrimSpace(s.Find("td").Last().Text())
|
||||
@@ -157,6 +191,7 @@ type EventDetails struct {
|
||||
Location Location
|
||||
Participants Participants
|
||||
Cost string
|
||||
Type string
|
||||
}
|
||||
|
||||
// DateRange represents a start and end date.
|
||||
|
Reference in New Issue
Block a user