HEX

File: C:/github_repos/casibase_customer_0058/split/markdown.go
// Copyright 2025 The Casibase Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//	http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package split

import (
	"fmt"
	"regexp"
	"strings"
)

type MarkdownSplitProvider struct{}

func NewMarkdownSplitProvider() (*MarkdownSplitProvider, error) {
	return &MarkdownSplitProvider{}, nil
}

func ExtractMarkdownTree(markdownText string) map[string]string {
	numberedHeadingPattern := regexp.MustCompile(`^(\d+(\.\d+)*\.)\s+(.+)$`)
	hashHeadingPattern := regexp.MustCompile(`^(#{1,6})\s+(.+)$`)

	lines := strings.Split(markdownText, "\n")
	result := make(map[string]string)

	var currentKey string
	var currentContent []string
	var path []string

	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		var isHeading bool
		var level int
		var title string

		if m := hashHeadingPattern.FindStringSubmatch(line); m != nil {
			title = fmt.Sprintf("%s %s", m[1], m[2])
			level = len(m[1])
			isHeading = true
		} else if m := numberedHeadingPattern.FindStringSubmatch(line); m != nil {
			title = fmt.Sprintf("%s %s", m[1], m[3])
			level = strings.Count(m[1], ".")
			isHeading = true
		}

		if isHeading {
			if currentKey != "" {
				result[currentKey] = strings.TrimSpace(strings.Join(currentContent, "\n"))
			} else {
				result["root"] = strings.TrimSpace(strings.Join(currentContent, "\n"))
			}

			// update path by level
			if level == len(path)+1 {
				// normal level up
				path = append(path, title)
			} else if level == len(path) {
				// same level, replace the last layer
				path[len(path)-1] = title
			} else if level < len(path) {
				// return to parent level
				path = path[:level-1]
				path = append(path, title)
			} else {
				path = append(path, title)
			}

			currentKey = strings.Join(path, " > ")
			currentContent = []string{}
		} else {
			currentContent = append(currentContent, line)
		}
	}

	if currentKey != "" {
		result[currentKey] = strings.TrimSpace(strings.Join(currentContent, "\n"))
	} else {
		result["root"] = strings.TrimSpace(strings.Join(currentContent, "\n"))
	}

	return result
}

func ExtractTablesAndRemainder(markdownText string) (string, []string, error) {
	tables := []string{}
	remainder := markdownText

	if strings.Contains(markdownText, "|") {
		// Standard Markdown table matching pattern
		borderTablePattern := regexp.MustCompile(`(?m)(?:\n|^)(?:\|.*?\|.*?\|.*?\n)(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)(?:\|.*?\|.*?\|.*?\n)+`)
		borderTables := borderTablePattern.FindAllString(markdownText, -1)
		tables = append(tables, borderTables...)
		remainder = borderTablePattern.ReplaceAllString(remainder, "\n")

		// Borderless Markdown Table Matching Mode
		noBorderTablePattern := regexp.MustCompile(`(?m)(?:\n|^)(?:\S.*?\|.*?\n)(?:(?:\s*[:-]+[-| :]*\s*).*?\n)(?:\S.*?\|.*?\n)+`)
		noBorderTables := noBorderTablePattern.FindAllString(remainder, -1)
		tables = append(tables, noBorderTables...)
		remainder = noBorderTablePattern.ReplaceAllString(remainder, "\n")
	}

	// If the remaining text contains'<table>'(case ignored), try extracting the HTML table
	if strings.Contains(strings.ToLower(remainder), "<table>") {
		htmlTablePattern := regexp.MustCompile(`(?i)(?:\n|^)\s*(?:(?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>[\s\S]*?</table>\s*</body>\s*</html>)|(?:<body[^>]*>\s*<table[^>]*>[\s\S]*?</table>\s*</body>)|(?:<table[^>]*>[\s\S]*?</table>))\s*(?:\n|$)`)
		htmlTables := htmlTablePattern.FindAllString(remainder, -1)
		tables = append(tables, htmlTables...)
		remainder = htmlTablePattern.ReplaceAllString(remainder, "\n")
	}
	return remainder, tables, nil
}

func ExtractTablesWithContext(markdownText string, contextKey string) (string, []string, error) {
	remainder, tables, err := ExtractTablesAndRemainder(markdownText)
	if err != nil {
		return "", nil, err
	}

	tablesWithContext := make([]string, len(tables))
	for i, table := range tables {
		tablesWithContext[i] = contextKey + "\n\n" + table
	}

	return remainder, tablesWithContext, nil
}

func (p *MarkdownSplitProvider) SplitText(text string) ([]string, error) {
	headingsMap := ExtractMarkdownTree(text)

	var sections []string

	for key, content := range headingsMap {
		remainder, tables, err := ExtractTablesWithContext(content, key)
		if err != nil {
			return nil, err
		}

		// add tables to sections
		for _, table := range tables {
			sections = append(sections, strings.TrimSpace(table))
		}

		// add text to sections
		if strings.TrimSpace(remainder) != "" {
			textSplitter, err := NewDefaultSplitProvider("markdown")
			if err != nil {
				return nil, err
			}

			textSections, err := textSplitter.SplitText(remainder)
			if err != nil {
				return nil, err
			}

			for _, section := range textSections {
				if strings.TrimSpace(section) != "" {
					sections = append(sections, key+"\n\n"+strings.TrimSpace(section))
				}
			}
		}
	}

	return sections, nil
}