Files
htwkalender/services/data-manager/service/fetch/v1/htmlParsingFunctions.go
2024-10-07 20:50:00 +02:00

248 lines
6.4 KiB
Go

//Calendar implementation for the HTWK Leipzig timetable. Evaluation and display of the individual dates in iCal format.
//Copyright (C) 2024 HTWKalender support@htwkalender.de
//This program is free software: you can redistribute it and/or modify
//it under the terms of the GNU Affero General Public License as published by
//the Free Software Foundation, either version 3 of the License, or
//(at your option) any later version.
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU Affero General Public License for more details.
//You should have received a copy of the GNU Affero General Public License
//along with this program. If not, see <https://www.gnu.org/licenses/>.
package v1
import (
"golang.org/x/net/html"
"log/slog"
"strings"
)
// findFirstTable returns the first <table> element encountered in a
// depth-first, document-order walk of the tree rooted at node (node itself
// included). It returns nil when node is nil or no <table> element exists.
func findFirstTable(node *html.Node) *html.Node {
	// A nil root yields nil instead of a panic, matching the nil handling of
	// findFirstSpanWithClass.
	if node == nil {
		return nil
	}
	if node.Type == html.ElementNode && node.Data == "table" {
		return node
	}
	// Descend into the children in order; the first hit wins.
	for child := node.FirstChild; child != nil; child = child.NextSibling {
		if found := findFirstTable(child); found != nil {
			return found
		}
	}
	return nil
}
// findFirstSpanWithClass walks the tree rooted at node depth-first and returns
// the first <span> element for which hasClassAttribute(node, classValue) is
// true. It returns nil for a nil node or when no such element exists.
func findFirstSpanWithClass(node *html.Node, classValue string) *html.Node {
	if node == nil {
		return nil
	}

	isSpan := node.Type == html.ElementNode && node.Data == "span"
	if isSpan && hasClassAttribute(node, classValue) {
		return node
	}

	// The node itself did not match, so search its subtrees in document order.
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if match := findFirstSpanWithClass(c, classValue); match != nil {
			return match
		}
	}
	return nil
}
// hasClassAttribute reports whether node has a "class" attribute whose value
// contains classValue. The comparison is a plain substring match on the whole
// attribute value, not a match on individual space-separated class names.
func hasClassAttribute(node *html.Node, classValue string) bool {
	for i := range node.Attr {
		attribute := node.Attr[i]
		if attribute.Key != "class" {
			continue
		}
		if strings.Contains(attribute.Val, classValue) {
			return true
		}
	}
	return false
}
// dayTable groups a day string with a slice of HTML nodes.
// NOTE(review): presumably the day label together with the table rows that
// belong to that day; the type is not referenced by the functions visible in
// this part of the file, so confirm against its users.
type dayTable struct {
day string
table []*html.Node
}
// getEventTables maps each day label to the event rows of its table.
//
// The tables are collected with findTables and paired with dayLabels by
// position, so both must have the same length; on a mismatch an error is
// logged and an empty map is returned. For every table the first row found by
// findTableRows is treated as the header and dropped. A day whose table has no
// rows beyond the header gets no entry in the result.
func getEventTables(node *html.Node, dayLabels []string) map[string][]*html.Node {
	dayTablesMap := make(map[string][]*html.Node, len(dayLabels))

	tables := findTables(node)
	if len(tables) != len(dayLabels) {
		// Without a one-to-one pairing the rows cannot be attributed to a day.
		slog.Error("Number of tables does not match number of day labels",
			"tables", len(tables), "dayLabels", len(dayLabels))
		return dayTablesMap
	}

	for i, day := range dayLabels {
		rows := findTableRows(tables[i])
		// Only days with at least one row after the header are stored, so the
		// map never holds empty slices and needs no cleanup pass afterwards.
		if len(rows) > 1 {
			dayTablesMap[day] = rows[1:]
		}
	}
	return dayTablesMap
}
// getAllDayLabels collects the day labels found below node. Every <p> element
// returned by findParagraphs is passed to getDayLabel, and empty labels are
// skipped. The result is nil when no label was found.
func getAllDayLabels(node *html.Node) []string {
	var labels []string
	for _, paragraph := range findParagraphs(node) {
		text := getDayLabel(paragraph)
		if text == "" {
			continue
		}
		labels = append(labels, text)
	}
	return labels
}
// findParagraphs returns every <p> element in the tree rooted at node, node
// itself included, in document (pre-order) sequence. The result is nil when
// the tree contains no <p> element.
func findParagraphs(node *html.Node) []*html.Node {
	var found []*html.Node

	var visit func(n *html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "p" {
			found = append(found, n)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}
	visit(node)

	return found
}
// findTableRows returns every <tr> element that is a direct child of a <tbody>
// anywhere in the tree rooted at node. The rows of a <tbody> are listed before
// rows found deeper inside it. No row is skipped here; callers that want to
// drop a header row must do so themselves. The result is never nil: when no
// row exists an empty slice is returned.
func findTableRows(node *html.Node) []*html.Node {
	rows := []*html.Node{}

	var collect func(n *html.Node)
	collect = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "tbody" {
			for c := n.FirstChild; c != nil; c = c.NextSibling {
				if c.Type == html.ElementNode && c.Data == "tr" {
					rows = append(rows, c)
				}
			}
		}
		// Keep descending so that <tbody> elements further down are seen too.
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			collect(c)
		}
	}
	collect(node)

	return rows
}
// findTables returns node itself when it is a <table> element (whatever its
// class), followed by the tables that findDayTables reports for each of
// node's children, in child order. The result is nil when nothing is found.
func findTables(node *html.Node) []*html.Node {
	var result []*html.Node

	isTable := node.Type == html.ElementNode && node.Data == "table"
	if isTable {
		result = append(result, node)
	}

	c := node.FirstChild
	for c != nil {
		result = append(result, findDayTables(c)...)
		c = c.NextSibling
	}
	return result
}
// findDayTables returns every <table> element in the tree rooted at node (node
// included) whose class attribute contains "spreadsheet". Children are visited
// before their parent, so a matching table nested inside another matching
// table comes first. The result is nil when there is no match.
func findDayTables(node *html.Node) []*html.Node {
	var matches []*html.Node

	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
		if n.Type != html.ElementNode || n.Data != "table" {
			return
		}
		if hasClassAttribute(n, "spreadsheet") {
			matches = append(matches, n)
		}
	}
	walk(node)

	return matches
}
// getDayLabel extracts a label from node. When node's first child is a <span>
// element that itself has at least one child, the Data field of that span's
// first child is returned (its text, if that child is a text node). In every
// other case the result is the empty string.
func getDayLabel(node *html.Node) string {
	span := node.FirstChild
	if span == nil || span.Type != html.ElementNode || span.Data != "span" {
		return ""
	}
	if span.FirstChild == nil {
		return ""
	}
	return span.FirstChild.Data
}
// findTableData returns the <td> elements that are direct children of node,
// in order, provided node is a <tr> element. For any other node, or a row
// without <td> children, the result is nil.
func findTableData(node *html.Node) []*html.Node {
	if node.Type != html.ElementNode || node.Data != "tr" {
		return nil
	}

	var cells []*html.Node
	for c := node.FirstChild; c != nil; c = c.NextSibling {
		if c.Type == html.ElementNode && c.Data == "td" {
			cells = append(cells, c)
		}
	}
	return cells
}
// getTextContent returns the concatenated Data of all text nodes in the tree
// rooted at node (node included), in document order. It returns the empty
// string for a nil node or a tree without text nodes.
func getTextContent(node *html.Node) string {
	if node == nil {
		return ""
	}

	// A single builder is shared by the whole walk so the text is copied once,
	// rather than re-concatenated at every level of the recursion.
	var sb strings.Builder

	var walk func(n *html.Node)
	walk = func(n *html.Node) {
		if n.Type == html.TextNode {
			sb.WriteString(n.Data)
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			walk(c)
		}
	}
	walk(node)

	return sb.String()
}