mirror of
https://gitlab.dit.htwk-leipzig.de/htwk-software/htwkalender.git
synced 2025-08-03 10:19:14 +02:00
fix:#82 added multithreaded fetching
This commit is contained in:
@@ -5,6 +5,7 @@ import (
|
|||||||
"net/http"
|
"net/http"
|
||||||
"strconv"
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
|
"sync"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
"github.com/PuerkitoBio/goquery"
|
"github.com/PuerkitoBio/goquery"
|
||||||
@@ -29,27 +30,43 @@ func main() {
|
|||||||
// Since the ids are not consecutive, it will take a while to fetch all sport courses.
|
// Since the ids are not consecutive, it will take a while to fetch all sport courses.
|
||||||
// @TODO: find the highest id and iterate over all ids from 0 to highest id
|
// @TODO: find the highest id and iterate over all ids from 0 to highest id
|
||||||
func fetchAllHtwkSportCourses() []Event {
|
func fetchAllHtwkSportCourses() []Event {
|
||||||
var events []Event
|
|
||||||
for i := 0; i <= 9999; i++ {
|
|
||||||
newEvent, err := fetchHtwkSportCourse("https://sport.htwk-leipzig.de/sportangebote/detail/sport/", i)
|
|
||||||
if err != nil {
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
events = append(events, newEvent...)
|
|
||||||
|
|
||||||
|
//multithreaded webpage requests to speed up the process
|
||||||
|
|
||||||
|
var maxPageID = 9999
|
||||||
|
var maxThreads = 300
|
||||||
|
var htmlPageArray = make([]*goquery.Document, maxPageID)
|
||||||
|
var url = "https://sport.htwk-leipzig.de/sportangebote/detail/sport/"
|
||||||
|
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
wg.Add(maxThreads)
|
||||||
|
|
||||||
|
for i := 0; i < maxThreads; i++ {
|
||||||
|
go func(i int) {
|
||||||
|
defer wg.Done()
|
||||||
|
for j := i; j < maxPageID; j += maxThreads {
|
||||||
|
doc, err := htmlRequest(url + strconv.Itoa(j))
|
||||||
|
if err == nil {
|
||||||
|
htmlPageArray[j] = doc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}(i)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
wg.Wait()
|
||||||
|
|
||||||
|
println("finished fetching all pages")
|
||||||
|
|
||||||
|
//print count of all pages
|
||||||
|
|
||||||
|
var events []Event
|
||||||
return events
|
return events
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchHtwkSportCourse fetches the sport course from the given url and id.
|
func htmlRequest(url string) (*goquery.Document, error) {
|
||||||
// If the sport course does not exist, it will return an error.
|
println("fetching " + url)
|
||||||
// If the sport course exists, it will return the sport course.
|
|
||||||
// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse.
|
|
||||||
// May be improved in the future.
|
|
||||||
func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
|
|
||||||
var events []Event
|
|
||||||
|
|
||||||
resp, err := http.Get(url + strconv.Itoa(id))
|
resp, err := http.Get(url)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -60,6 +77,18 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
|
|||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
println("finished fetching " + url)
|
||||||
|
return doc, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// fetchHtwkSportCourse extracts the sport course events from the given, already fetched goquery document.
|
||||||
|
// If the sport course does not exist, it will return an error.
|
||||||
|
// If the sport course exists, it will return the sport course.
|
||||||
|
// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse.
|
||||||
|
// May be improved in the future.
|
||||||
|
func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
|
||||||
|
var events []Event
|
||||||
|
|
||||||
if doc.Find("h1").Text() == "Aktuelle Sportangebote" {
|
if doc.Find("h1").Text() == "Aktuelle Sportangebote" {
|
||||||
return nil, errors.New("not a sport course page")
|
return nil, errors.New("not a sport course page")
|
||||||
}
|
}
|
||||||
@@ -73,6 +102,11 @@ func fetchHtwkSportCourse(url string, id int) ([]Event, error) {
|
|||||||
if len(titleParts) > 0 {
|
if len(titleParts) > 0 {
|
||||||
event.Title = strings.TrimSpace(titleParts[0])
|
event.Title = strings.TrimSpace(titleParts[0])
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if len(titleParts) > 2 {
|
||||||
|
details.Type = strings.TrimSpace(titleParts[len(titleParts)-1])
|
||||||
|
}
|
||||||
|
|
||||||
s.NextFiltered("table.eventDetails").Find("tr").Each(func(i int, s *goquery.Selection) {
|
s.NextFiltered("table.eventDetails").Find("tr").Each(func(i int, s *goquery.Selection) {
|
||||||
key := strings.TrimSpace(s.Find("td").First().Text())
|
key := strings.TrimSpace(s.Find("td").First().Text())
|
||||||
value := strings.TrimSpace(s.Find("td").Last().Text())
|
value := strings.TrimSpace(s.Find("td").Last().Text())
|
||||||
@@ -157,6 +191,7 @@ type EventDetails struct {
|
|||||||
Location Location
|
Location Location
|
||||||
Participants Participants
|
Participants Participants
|
||||||
Cost string
|
Cost string
|
||||||
|
Type string
|
||||||
}
|
}
|
||||||
|
|
||||||
// DateRange represents a start and end date.
|
// DateRange represents a start and end date.
|
||||||
|
Reference in New Issue
Block a user