Files
htwkalender/services/data-manager/service/fetch/sport/sportFetcher.go
2025-04-20 14:13:14 +02:00

573 lines
17 KiB
Go

//Calendar implementation for the HTWK Leipzig timetable. Evaluation and display of the individual dates in iCal format.
//Copyright (C) 2024 HTWKalender support@htwkalender.de
//This program is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU Affero General Public License for more details.
//You should have received a copy of the GNU Affero General Public License
//along with this program. If not, see <https://www.gnu.org/licenses/>.
package sport
import (
"errors"
"github.com/google/uuid"
"github.com/pocketbase/pocketbase"
"github.com/pocketbase/pocketbase/tools/types"
"htwkalender/data-manager/model"
"htwkalender/data-manager/service/db"
"htwkalender/data-manager/service/functions"
clock "htwkalender/data-manager/service/functions/time"
"io"
"log/slog"
"net/http"
"regexp"
"strconv"
"strings"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
)
// FetchAndUpdateSportEvents fetches all sport courses from the HTWK sport
// website, expands them into events, drops every event that starts on the same
// calendar day as a holiday, and replaces the stored "Sport" events of the
// current semester with the result.
//
// The stored events are deleted before the new ones are saved; both steps run
// without a surrounding transaction. It returns the events that were saved, or
// the first error reported by the fetch or by the database.
func FetchAndUpdateSportEvents(app *pocketbase.PocketBase) ([]model.Event, error) {
	sportCourseLinks, err := fetchAllAvailableSportCourses()
	if err != nil {
		return nil, err
	}

	sportEntries := fetchHTWKSportCourses(sportCourseLinks)
	events := formatEntriesToEvents(sportEntries)

	// Find the earliest start and the latest end over all events. The zero
	// time.Time lies before every real date, so earliestDate has to be seeded
	// from the first event instead of being compared against its zero value.
	var earliestDate, latestDate time.Time
	for _, event := range events {
		start, end := event.Start.Time(), event.End.Time()
		if earliestDate.IsZero() || start.Before(earliestDate) {
			earliestDate = start
		}
		if end.After(latestDate) {
			latestDate = end
		}
	}

	// get all events from database where name = Feiertage und lehrveranstaltungsfreie Tage
	holidays, err := db.GetAllModulesByNameAndDateRange(app, "Feiertage und lehrveranstaltungsfreie Tage", earliestDate, latestDate)
	if err != nil {
		return nil, err
	}

	// Collect the calendar days on which a holiday starts.
	type calendarDay struct {
		year  int
		month time.Month
		day   int
	}
	holidayDays := make(map[calendarDay]struct{})
	for _, holiday := range holidays {
		year, month, day := holiday.Start.Time().Date()
		holidayDays[calendarDay{year, month, day}] = struct{}{}
	}

	// Keep only the events that do not start on a holiday. Filtering into the
	// same backing array avoids deleting from the slice while iterating it.
	kept := events[:0]
	for _, event := range events {
		year, month, day := event.Start.Time().Date()
		if _, isHoliday := holidayDays[calendarDay{year, month, day}]; !isHoliday {
			kept = append(kept, event)
		}
	}
	events = kept

	// @TODO: delete and save events in one transaction and it only should delete events that are not in the new events list and save events that are not in the database
	err = db.DeleteAllEventsByCourse(app, "Sport", functions.GetCurrentSemesterString(clock.RealClock{}))
	if err != nil {
		return nil, err
	}

	// save events to database
	savedEvents, err := db.SaveEvents(events, app)
	if err != nil {
		return nil, err
	}
	return savedEvents, nil
}
// formatEntriesToEvents turns every sport entry into one model.Event per
// occurrence returned by getWeekEvents for the entry's date range and cycle.
// All occurrences of an entry share the same UUID, name and semester; start and
// end are converted to UTC before they are stored.
func formatEntriesToEvents(entries []model.SportEntry) []model.Event {
	var result []model.Event

	for _, entry := range entries {
		details := entry.Details
		occurrenceStarts, occurrenceEnds := getWeekEvents(details.DateRange.Start, details.DateRange.End, details.Cycle)

		// These values do not depend on the individual occurrence.
		courseUUID := uuid.NewSHA1(uuid.NameSpaceDNS, []byte(entry.Title+entry.ID+details.Type)).String()
		displayName := entry.Title + " (" + entry.ID + ")"
		semester := checkSemester(details.DateRange.Start)

		for idx, occurrenceStart := range occurrenceStarts {
			// Parse errors are discarded here; a failed parse leaves the zero DateTime.
			start, _ := types.ParseDateTime(occurrenceStart.In(time.UTC))
			end, _ := types.ParseDateTime(occurrenceEnds[idx].In(time.UTC))

			result = append(result, model.Event{
				UUID: courseUUID,
				Day:  toGermanWeekdayString(start.Time().Weekday()),
				// NOTE(review): the week is hard-coded to "23" for every event — confirm this is intended.
				Week:      strconv.Itoa(23),
				Start:     start,
				End:       end,
				Name:      displayName,
				EventType: details.Type,
				Prof:      details.CourseLead.Name,
				Rooms:     details.Location.Name,
				Notes:     entry.AdditionalNote,
				BookedAt:  "",
				Course:    "Sport",
				Semester:  semester,
			})
		}
	}

	return result
}
// getDayInt converts a German two-letter weekday abbreviation ("Mo", "Di",
// "Mi", "Do", "Fr", "Sa", "So") into its weekday number, where "So" is 0 and
// "Sa" is 6. Any other input yields -1 and a "no day found" error.
func getDayInt(weekDay string) (int, error) {
	// The position of an abbreviation in this list is its weekday number.
	abbreviations := [...]string{"So", "Mo", "Di", "Mi", "Do", "Fr", "Sa"}

	for number, abbreviation := range abbreviations {
		if abbreviation == weekDay {
			return number, nil
		}
	}
	return -1, errors.New("no day found")
}
// toGermanWeekdayString returns the German name of the given weekday
// ("Montag" … "Sonntag"). Values outside time.Sunday..time.Saturday yield
// the empty string.
func toGermanWeekdayString(weekday time.Weekday) string {
	germanNames := [...]string{
		time.Sunday:    "Sonntag",
		time.Monday:    "Montag",
		time.Tuesday:   "Dienstag",
		time.Wednesday: "Mittwoch",
		time.Thursday:  "Donnerstag",
		time.Friday:    "Freitag",
		time.Saturday:  "Samstag",
	}

	if weekday < time.Sunday || weekday > time.Saturday {
		return ""
	}
	return germanNames[weekday]
}
// cycleTimePattern matches a clock time written as HH:MM and captures the hour
// and the minute. It is compiled once instead of on every call.
var cycleTimePattern = regexp.MustCompile("([0-9]{2}):([0-9]{2})")

// extractStartAndEndTime reads the first two HH:MM times from cycle and returns
// them as start hour, start minute, end hour and end minute.
//
// If cycle contains fewer than two times, the problem is logged and all four
// values are 0 instead of the function panicking on a missing match.
func extractStartAndEndTime(cycle string) (int, int, int, int) {
	times := cycleTimePattern.FindAllStringSubmatch(cycle, 2)
	if len(times) < 2 {
		slog.Error("Error while extracting start and end time from cycle", "cycle", cycle)
		return 0, 0, 0, 0
	}

	// Atoi cannot fail here: each capture group holds exactly two ASCII digits.
	startHour, _ := strconv.Atoi(times[0][1])
	startMinute, _ := strconv.Atoi(times[0][2])
	endHour, _ := strconv.Atoi(times[1][1])
	endMinute, _ := strconv.Atoi(times[1][2])
	return startHour, startMinute, endHour, endMinute
}
// weekdayAbbreviationPattern matches two-letter tokens made of an upper-case
// letter followed by a lower-case letter, e.g. "Mo" or "Di".
var weekdayAbbreviationPattern = regexp.MustCompile("[A-Z][a-z]")

// getWeekEvents expands a cycle description into the concrete start and end
// times of every occurrence between start and end.
//
// The cycle is split into parts by splitByCommaWithTime. The text of a part in
// front of its first digit decides how its weekdays are read:
//   - a "-" marks a range between the first two weekday tokens; ranges wrap
//     around the end of the week, so one that ends on "So" includes Sunday,
//   - otherwise (single day or days joined by "+") every weekday token of the
//     part is used.
//
// Each selected weekday contributes exactly one weekly series. Parts without a
// time, ranges with fewer than two weekday tokens and unknown weekday tokens
// are logged and skipped. The two returned slices have the same length; the
// element at index i of the second slice is the end of the occurrence starting
// at index i of the first.
func getWeekEvents(start time.Time, end time.Time, cycle string) ([]time.Time, []time.Time) {
	var weekEvents []model.SportDayStartEnd

	for _, cyclePart := range splitByCommaWithTime(cycle) {
		// Everything in front of the first digit is the weekday section.
		firstDigit := strings.IndexFunc(cyclePart, func(r rune) bool { return r >= '0' && r <= '9' })
		if firstDigit < 0 {
			slog.Error("Error while parsing cycle part: no time found", "cyclePart", cyclePart)
			continue
		}
		cyclePartWithDaysOnly := cyclePart[:firstDigit]

		days := weekdayAbbreviationPattern.FindAllString(cyclePart, -1)
		startHour, startMinute, endHour, endMinute := extractStartAndEndTime(cyclePart)

		var dayNumbers []int
		switch {
		case strings.Contains(cyclePartWithDaysOnly, "-"):
			// Range of days, e.g. "Mo-Fr".
			if len(days) < 2 {
				slog.Error("Error while parsing day range: two days expected", "cyclePart", cyclePart)
				continue
			}
			startI, startIErr := getDayInt(days[0])
			endI, endIErr := getDayInt(days[1])
			if startIErr != nil || endIErr != nil {
				slog.Error("StartError while getting day int: "+days[0]+" - "+days[1]+" :", "error", startIErr)
				slog.Error("EndError while getting day int: "+days[0]+" - "+days[1]+" :", "error", endIErr)
				continue
			}
			// Walk the week modulo 7 so that ranges crossing Saturday/Sunday
			// (Sunday is 0) are covered. Both bounds are within 0..6, so the
			// loop reaches endI after at most seven steps.
			for i := startI; ; i = (i + 1) % 7 {
				dayNumbers = append(dayNumbers, i)
				if i == endI {
					break
				}
			}
		default:
			// Single day or several days joined by "+".
			for _, day := range days {
				dayInt, err := getDayInt(day)
				if err != nil {
					slog.Error("Error while getting day int: "+day+" ", "error", err)
					continue
				}
				dayNumbers = append(dayNumbers, dayInt)
			}
		}

		// Append instead of assign so that earlier cycle parts are kept.
		weekEvents = append(weekEvents, createEventListFromStartToEndMatchingDay23(dayNumbers, start, startHour, startMinute, end, endHour, endMinute)...)
	}

	var startDatesList []time.Time
	var endDatesList []time.Time
	for _, weekEvent := range weekEvents {
		startDates, endDates := createEventListFromStartToEndMatchingDay(weekEvent)
		startDatesList = append(startDatesList, startDates...)
		endDatesList = append(endDatesList, endDates...)
	}
	return startDatesList, endDatesList
}
// createEventListFromStartToEndMatchingDay23 builds one SportDayStartEnd per
// entry of days. Every element spans from the date of start at
// startHour:startMinute to the date of end at endHour:endMinute and carries the
// corresponding weekday number as its Day. An empty days list yields nil.
func createEventListFromStartToEndMatchingDay23(days []int, start time.Time, startHour int, startMinute int, end time.Time, endHour int, endMinute int) []model.SportDayStartEnd {
	if len(days) == 0 {
		return nil
	}

	// The span is the same for every weekday, so it is computed only once.
	spanStart := time.Date(start.Year(), start.Month(), start.Day(), startHour, startMinute, 0, 0, start.Location())
	spanEnd := time.Date(end.Year(), end.Month(), end.Day(), endHour, endMinute, 0, 0, end.Location())

	result := make([]model.SportDayStartEnd, len(days))
	for idx := range days {
		result[idx] = model.SportDayStartEnd{
			Start: spanStart,
			End:   spanEnd,
			Day:   time.Weekday(days[idx]),
		}
	}
	return result
}
// createEventListFromStartToEndMatchingDay walks day by day from
// weekEvent.Start up to (but not including) weekEvent.End and, for every day
// whose weekday equals weekEvent.Day, emits a start time and an end time on
// that day. The clock times are taken from weekEvent.Start and weekEvent.End.
func createEventListFromStartToEndMatchingDay(weekEvent model.SportDayStartEnd) ([]time.Time, []time.Time) {
	var (
		starts []time.Time
		ends   []time.Time
	)

	startHour, startMinute := weekEvent.Start.Hour(), weekEvent.Start.Minute()
	endHour, endMinute := weekEvent.End.Hour(), weekEvent.End.Minute()

	current := weekEvent.Start
	for current.Before(weekEvent.End) {
		if current.Weekday() == weekEvent.Day {
			year, month, day := current.Date()
			loc := current.Location()
			starts = append(starts, time.Date(year, month, day, startHour, startMinute, 0, 0, loc))
			ends = append(ends, time.Date(year, month, day, endHour, endMinute, 0, 0, loc))
		}
		current = current.AddDate(0, 0, 1)
	}

	return starts, ends
}
// splitByCommaWithTime splits input at ", " and glues the pieces back together
// so that pieces without a "<days> HH:MM-HH:MM" section are merged with the
// pieces that follow them. A piece that contains such a section closes the
// current group; leftover pieces at the end form a final group.
func splitByCommaWithTime(input string) []string {
	// Matches a day (or days joined by "+") followed by a time span.
	dayWithTime := regexp.MustCompile(`([A-Za-z]{2,}(\+[A-Za-z]{2,})* \d{2}:\d{2}-\d{2}:\d{2})`)

	var (
		groups  []string
		pending string
	)

	for _, piece := range strings.Split(input, ", ") {
		// The group this piece would complete or extend.
		candidate := piece
		if pending != "" {
			candidate = pending + ", " + piece
		}

		if dayWithTime.MatchString(piece) {
			groups = append(groups, candidate)
			pending = ""
			continue
		}
		pending = candidate
	}

	// Flush whatever is still waiting for a piece with a time.
	if pending != "" {
		groups = append(groups, pending)
	}
	return groups
}
// checkSemester returns "ss" for dates in April through September and "ws"
// for every other month.
func checkSemester(date time.Time) string {
	switch date.Month() {
	case time.April, time.May, time.June, time.July, time.August, time.September:
		return "ss"
	}
	return "ws"
}
// fetchAllAvailableSportCourses downloads the overview page of the HTWK sport
// website and returns the href of every anchor that starts with
// "/sportangebote/detail/sport/". A failed download is logged and returned.
func fetchAllAvailableSportCourses() ([]string, error) {
	const overviewURL = "https://sport.htwk-leipzig.de/sportangebote"
	const detailPrefix = "/sportangebote/detail/sport/"

	doc, err := htmlRequest(overviewURL)
	if err != nil {
		slog.Error("Error while fetching sport courses from webpage", "error", err)
		return nil, err
	}

	// Collect the relative links of all course detail pages.
	var courseLinks []string
	doc.Find("a[href]").Each(func(_ int, anchor *goquery.Selection) {
		if href, _ := anchor.Attr("href"); strings.HasPrefix(href, detailPrefix) {
			courseLinks = append(courseLinks, href)
		}
	})
	return courseLinks, nil
}
// fetchHTWKSportCourses downloads the course page behind every link (relative
// to https://sport.htwk-leipzig.de) and parses it into sport entries.
//
// The downloads are spread over a fixed number of goroutines; goroutine i
// handles the links i, i+maxThreads, i+2*maxThreads, … so every goroutine
// writes to its own slots of the result slice. Pages that cannot be downloaded
// or parsed are logged and skipped, so the result may cover fewer pages than
// links were given.
func fetchHTWKSportCourses(links []string) []model.SportEntry {
	const maxThreads = 10
	const hostUrl = "https://sport.htwk-leipzig.de"

	htmlPageArray := make([]*goquery.Document, len(links))

	var wg sync.WaitGroup
	wg.Add(maxThreads)
	for i := 0; i < maxThreads; i++ {
		go func(i int) {
			defer wg.Done()
			for j := i; j < len(links); j += maxThreads {
				doc, err := htmlRequest(hostUrl + links[j])
				if err != nil {
					slog.Error("Error while fetching sport course page", "link", links[j], "error", err)
					continue
				}
				htmlPageArray[j] = doc
			}
		}(i)
	}
	wg.Wait()

	var events []model.SportEntry
	for j, doc := range htmlPageArray {
		if doc == nil {
			// The download failed and has already been logged.
			continue
		}
		event, err := fetchHtwkSportCourse(doc)
		if err != nil {
			slog.Warn("Skipping sport course page", "link", links[j], "error", err)
			continue
		}
		events = append(events, event...)
	}
	return events
}
// sportHTTPClient is used for all page downloads. Unlike the default client it
// has a timeout, so a stalled connection cannot block a download forever.
var sportHTTPClient = &http.Client{Timeout: 30 * time.Second}

// htmlRequest downloads url with a GET request and parses the response body
// into a goquery document.
//
// It returns an error if the request fails, if the final response status is
// not in the 2xx range, or if the body cannot be parsed. The response body is
// always closed; an error while closing it is only logged.
func htmlRequest(url string) (*goquery.Document, error) {
	resp, err := sportHTTPClient.Get(url)
	if err != nil {
		return nil, err
	}
	defer func(Body io.ReadCloser) {
		readErr := Body.Close()
		if readErr != nil {
			slog.Error("Error while closing response body from html request", "error", readErr)
			return
		}
	}(resp.Body)

	// Do not hand error pages to the HTML parser as if they were content.
	if resp.StatusCode < http.StatusOK || resp.StatusCode >= http.StatusMultipleChoices {
		return nil, errors.New("unexpected response status " + resp.Status + " for " + url)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, err
	}
	return doc, nil
}
// fetchHtwkSportCourse extracts all sport entries from an already downloaded
// course page.
//
// Every ".eventHead" element yields one entry: the title, type and ID are read
// from its h3 heading, the remaining details from the rows of the following
// "table.eventDetails", where the first cell of a row is the key and the last
// cell the value. Dates are interpreted in the Europe/Berlin time zone.
//
// It returns an error if that time zone cannot be loaded or if the page's h1
// reads "Aktuelle Sportangebote", which marks a page that is not a course
// page. A date range that cannot be parsed completely is logged and left
// unset, so that a half-parsed range is never expanded into events.
// The html structure is not very consistent, so it is hard to parse.
// May be improved in the future.
func fetchHtwkSportCourse(doc *goquery.Document) ([]model.SportEntry, error) {
	var events []model.SportEntry

	// A nil location would make time.ParseInLocation panic further down.
	germanTime, err := time.LoadLocation("Europe/Berlin")
	if err != nil {
		return nil, err
	}

	if doc.Find("h1").Text() == "Aktuelle Sportangebote" {
		return nil, errors.New("not a sport course page")
	}

	doc.Find(".eventHead").Each(func(_ int, head *goquery.Selection) {
		var event model.SportEntry
		var details model.EventDetails

		// The heading is split at "-": the first part is the title and, if
		// there are more than two parts, the last part is the type.
		fullTitle := strings.TrimSpace(head.Find("h3").Text())
		titleParts := strings.Split(fullTitle, "-")
		if len(titleParts) > 0 {
			event.Title = strings.TrimSpace(titleParts[0])
		}
		if len(titleParts) > 2 {
			details.Type = strings.TrimSpace(titleParts[len(titleParts)-1])
		}
		event.ID = parseEventID(fullTitle)

		head.NextFiltered("table.eventDetails").Find("tr").Each(func(_ int, row *goquery.Selection) {
			key := strings.TrimSpace(row.Find("td").First().Text())
			value := strings.TrimSpace(row.Find("td").Last().Text())

			switch key {
			case "Zeitraum":
				dates := strings.Split(value, "-")
				if len(dates) == 2 {
					startDate, startErr := time.ParseInLocation("02.01.2006", strings.TrimSpace(dates[0]), germanTime)
					endDate, endErr := time.ParseInLocation("02.01.2006", strings.TrimSpace(dates[1]), germanTime)
					if startErr != nil || endErr != nil {
						slog.Error("Error while parsing date range of sport course", "value", value, "startError", startErr, "endError", endErr)
					} else {
						details.DateRange = model.DateRange{Start: startDate, End: endDate}
					}
				}
			case "Zyklus":
				details.Cycle = value
			case "Geschlecht":
				details.Gender = value
			case "Leiter":
				// Several leaders are separated by line breaks; join them with ", ".
				leaderName := strings.TrimSpace(row.Find("td a").Text())
				leadersSlice := strings.Split(leaderName, "\n")
				for idx, leader := range leadersSlice {
					leadersSlice[idx] = strings.TrimSpace(leader)
				}
				formattedLeaders := strings.Join(leadersSlice, ", ")
				leaderLink, _ := row.Find("td a").Attr("href")
				details.CourseLead = model.CourseLead{Name: formattedLeaders, Link: leaderLink}
			case "Ort":
				// Expected form: "<name> (<address>)".
				locationDetails := strings.Split(value, "(")
				if len(locationDetails) == 2 {
					details.Location = model.Location{
						Name:    strings.TrimSpace(locationDetails[0]),
						Address: strings.TrimRight(strings.TrimSpace(locationDetails[1]), ")"),
					}
				}
			case "Teilnehmer":
				// Expected form: "<bookings> / <total places> / <wait list>".
				// Parts that are not numbers are stored as 0.
				parts := strings.Split(value, "/")
				if len(parts) >= 3 {
					bookings, _ := strconv.Atoi(strings.TrimSpace(parts[0]))
					totalPlaces, _ := strconv.Atoi(strings.TrimSpace(parts[1]))
					waitList, _ := strconv.Atoi(strings.TrimSpace(parts[2]))
					details.Participants = model.Participants{Bookings: bookings, TotalPlaces: totalPlaces, WaitList: waitList}
				}
			case "Kosten":
				details.Cost = value // makes no sense since you need to be logged in to see the price
			case "Hinweis":
				// Collect the text nodes and the "h4.eventAdvice" elements of the value cell.
				var allNotes []string
				row.Find("td").Last().Contents().Each(func(_ int, node *goquery.Selection) {
					if node.Is("h4.eventAdvice") || goquery.NodeName(node) == "#text" {
						note := strings.TrimSpace(node.Text())
						if note != "" {
							allNotes = append(allNotes, note)
						}
					}
				})
				event.AdditionalNote = strings.Join(allNotes, " ")
			}
		})

		event.Details = details
		events = append(events, event)
	})

	return events, nil
}
// eventNumberPattern matches the first run of one to four digits.
var eventNumberPattern = regexp.MustCompile("[0-9]{1,4}")

// parseEventID extracts the event ID from the full title of a sport course.
//
// The title is split at single spaces. If one of the words is "Nr." or "Nr:",
// the word that follows the first such marker is the ID. If there is no marker,
// or the marker is the last word or is followed by an empty word, the first run
// of one to four digits anywhere in the title is used instead. The result is
// empty if the title contains neither.
func parseEventID(fulltitle string) string {
	words := strings.Split(fulltitle, " ")
	for i, word := range words {
		if word != "Nr." && word != "Nr:" {
			continue
		}
		// Guard against a marker at the very end of the title.
		if i+1 < len(words) && words[i+1] != "" {
			return words[i+1]
		}
		break
	}
	return eventNumberPattern.FindString(fulltitle)
}