// Mirror of https://gitlab.dit.htwk-leipzig.de/htwk-software/htwkalender.git
// (synced 2025-08-03 10:19:14 +02:00; 200 lines, 5.5 KiB, Go).

package main
|
|
|
|
import (
|
|
"errors"
|
|
"net/http"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
)
|
|
|
|
// @TODO: convert the extracted data into the event model so that every event has a real start and end date
|
|
// @TODO: add tests
|
|
// @TODO: add it to the service
|
|
// @TODO: make it like a cron job to fetch the sport courses once a week
|
|
func main() {
|
|
|
|
var sportCourseLinks = fetchAllAvailableSportCourses()
|
|
events := fetchHTWKSportCourses(sportCourseLinks)
|
|
|
|
for _, event := range events {
|
|
println(event.Title)
|
|
}
|
|
}
|
|
|
|
// fetch the main page where all sport courses are listed and extract all links to the sport courses
|
|
func fetchAllAvailableSportCourses() []string {
|
|
var url = "https://sport.htwk-leipzig.de/sportangebote"
|
|
|
|
var doc, err = htmlRequest(url)
|
|
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
|
|
// link list of all sport courses
|
|
var links []string
|
|
|
|
// find all links to sport courses with regex https://sport.htwk-leipzig.de/sportangebote/detail/sport/ + [0-9]{1,4}
|
|
doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
|
|
link, _ := s.Attr("href")
|
|
if strings.HasPrefix(link, "/sportangebote/detail/sport/") {
|
|
links = append(links, link)
|
|
}
|
|
})
|
|
|
|
return links
|
|
}
|
|
|
|
// fetchAllHTWKSportCourses fetches all sport courses from the given links.
|
|
// to speed up the process, it uses multithreading.
|
|
|
|
func fetchHTWKSportCourses(links []string) []Event {
|
|
|
|
//multithreaded webpage requests to speed up the process
|
|
|
|
var maxThreads = 10
|
|
var htmlPageArray = make([]*goquery.Document, len(links))
|
|
var hostUrl = "https://sport.htwk-leipzig.de"
|
|
|
|
var wg sync.WaitGroup
|
|
wg.Add(maxThreads)
|
|
for i := 0; i < maxThreads; i++ {
|
|
go func(i int) {
|
|
for j := i; j < len(links); j += maxThreads {
|
|
doc, err := htmlRequest(hostUrl + links[j])
|
|
if err == nil {
|
|
htmlPageArray[j] = doc
|
|
}
|
|
}
|
|
wg.Done()
|
|
}(i)
|
|
}
|
|
wg.Wait()
|
|
|
|
var events []Event
|
|
|
|
for _, doc := range htmlPageArray {
|
|
if doc != nil {
|
|
event, err := fetchHtwkSportCourse(doc)
|
|
if err == nil {
|
|
events = append(events, event...)
|
|
}
|
|
}
|
|
}
|
|
return events
|
|
}
|
|
|
|
func htmlRequest(url string) (*goquery.Document, error) {
|
|
println("fetching " + url)
|
|
|
|
resp, err := http.Get(url)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
println("finished fetching " + url)
|
|
return doc, nil
|
|
}
|
|
|
|
// fetchHtwkSportCourse fetches the sport course from the given url and id.
|
|
// If the sport course does not exist, it will return an error.
|
|
// If the sport course exists, it will return the sport course.
|
|
// goquery is used to parse the html. The html structure is not very consistent, so it is hard to parse.
|
|
// May be improved in the future.
|
|
func fetchHtwkSportCourse(doc *goquery.Document) ([]Event, error) {
|
|
var events []Event
|
|
|
|
if doc.Find("h1").Text() == "Aktuelle Sportangebote" {
|
|
return nil, errors.New("not a sport course page")
|
|
}
|
|
|
|
doc.Find(".eventHead").Each(func(i int, s *goquery.Selection) {
|
|
var event Event
|
|
var details EventDetails
|
|
|
|
fullTitle := strings.TrimSpace(s.Find("h3").Text())
|
|
titleParts := strings.Split(fullTitle, "-")
|
|
if len(titleParts) > 0 {
|
|
event.Title = strings.TrimSpace(titleParts[0])
|
|
}
|
|
|
|
if len(titleParts) > 2 {
|
|
details.Type = strings.TrimSpace(titleParts[len(titleParts)-1])
|
|
}
|
|
|
|
s.NextFiltered("table.eventDetails").Find("tr").Each(func(i int, s *goquery.Selection) {
|
|
key := strings.TrimSpace(s.Find("td").First().Text())
|
|
value := strings.TrimSpace(s.Find("td").Last().Text())
|
|
|
|
switch key {
|
|
case "Zeitraum":
|
|
dates := strings.Split(value, "-")
|
|
if len(dates) == 2 {
|
|
startDate, _ := time.Parse("02.01.2006", strings.TrimSpace(dates[0]))
|
|
endDate, _ := time.Parse("02.01.2006", strings.TrimSpace(dates[1]))
|
|
details.DateRange = DateRange{Start: startDate, End: endDate}
|
|
}
|
|
case "Zyklus":
|
|
details.Cycle = value
|
|
case "Geschlecht":
|
|
details.Gender = value
|
|
case "Leiter":
|
|
leaderName := strings.TrimSpace(s.Find("td a").Text())
|
|
leadersSlice := strings.Split(leaderName, "\n")
|
|
for i, leader := range leadersSlice {
|
|
leadersSlice[i] = strings.TrimSpace(leader)
|
|
}
|
|
formattedLeaders := strings.Join(leadersSlice, ", ")
|
|
leaderLink, _ := s.Find("td a").Attr("href")
|
|
details.CourseLead = CourseLead{Name: formattedLeaders, Link: leaderLink}
|
|
case "Ort":
|
|
locationDetails := strings.Split(value, "(")
|
|
if len(locationDetails) == 2 {
|
|
details.Location = Location{
|
|
Name: strings.TrimSpace(locationDetails[0]),
|
|
Address: strings.TrimRight(strings.TrimSpace(locationDetails[1]), ")"),
|
|
}
|
|
}
|
|
case "Teilnehmer":
|
|
parts := strings.Split(value, "/")
|
|
if len(parts) >= 3 {
|
|
bookings, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
|
|
totalPlaces, _ := strconv.Atoi(strings.TrimSpace(parts[1]))
|
|
waitList, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
|
|
details.Participants = Participants{Bookings: bookings, TotalPlaces: totalPlaces, WaitList: waitList}
|
|
}
|
|
case "Kosten":
|
|
details.Cost = value // makes no sense since you need to be logged in to see the price
|
|
case "Hinweis":
|
|
var allNotes []string
|
|
|
|
s.Find("td").Last().Contents().Each(func(i int, s *goquery.Selection) {
|
|
if s.Is("h4.eventAdvice") || goquery.NodeName(s) == "#text" {
|
|
note := strings.TrimSpace(s.Text())
|
|
if note != "" {
|
|
allNotes = append(allNotes, note)
|
|
}
|
|
}
|
|
})
|
|
|
|
event.AdditionalNote = strings.Join(allNotes, " ")
|
|
}
|
|
})
|
|
|
|
event.Details = details
|
|
events = append(events, event)
|
|
})
|
|
|
|
return events, nil
|
|
}
|