HEX
Server: Apache/2.4.54 (Win64) OpenSSL/1.1.1p PHP/7.4.30
System: Windows NT website-api 10.0 build 20348 (Windows Server 2016) AMD64
User: SYSTEM (0)
PHP: 7.4.30
Disabled: NONE
Upload Files
File: C:/github_repos/casibase/txt/pptx.go
// Copyright 2023 The Casibase Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package txt

import (
	"archive/zip"
	"encoding/xml"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"
)

// slideFilenameRegex matches the path of a slide part inside a PPTX archive
// and captures the slide's numeric suffix. It is compiled once at package
// initialization rather than on every call.
var slideFilenameRegex = regexp.MustCompile(`ppt/slides/slide(\d+)\.xml`)

// getPageNumberFromSlideFilename extracts the slide number from an archive
// entry name such as "ppt/slides/slide12.xml".
//
// The pattern is not anchored, so it is searched for anywhere in filename.
// The function returns -1 when filename does not contain the pattern, or when
// the captured digits cannot be converted to an int (for example because the
// value is out of range).
func getPageNumberFromSlideFilename(filename string) int {
	matches := slideFilenameRegex.FindStringSubmatch(filename)
	if len(matches) < 2 {
		return -1
	}

	pageNum, err := strconv.Atoi(matches[1])
	if err != nil {
		return -1
	}

	return pageNum
}

// getTextFromPptx extracts the text of every slide in the PPTX file at path.
//
// The file is opened as a zip archive. Every entry whose name starts with
// "ppt/slides/slide" and ends with ".xml" is parsed as XML, and the character
// data of each element whose local name is "t" is collected, with a single
// space appended after each element. Slides that yield no text are skipped.
// Each remaining slide contributes "Page N content is: [...]" to the result,
// or "Unknown page content is: [...]" when no slide number can be derived
// from the entry name. Slides are emitted in the order their entries appear
// in the archive's file list, with no separator between them.
//
// On any failure it returns an empty string and a non-nil error. The error
// from opening the archive is returned as-is so that callers inspecting it
// directly (for example with os.IsNotExist) keep working; errors raised while
// reading a slide are wrapped with the slide's entry name.
func getTextFromPptx(path string) (string, error) {
	r, err := zip.OpenReader(path)
	if err != nil {
		return "", err
	}
	defer r.Close()

	var text strings.Builder

	for _, f := range r.File {
		if !strings.HasPrefix(f.Name, "ppt/slides/slide") || !strings.HasSuffix(f.Name, ".xml") {
			continue
		}

		slideText, err := getTextFromPptxSlide(f)
		if err != nil {
			return "", fmt.Errorf("reading slide %q: %w", f.Name, err)
		}
		if slideText == "" {
			continue
		}

		if pageNum := getPageNumberFromSlideFilename(f.Name); pageNum != -1 {
			fmt.Fprintf(&text, "Page %d content is: [%s]", pageNum, slideText)
		} else {
			fmt.Fprintf(&text, "Unknown page content is: [%s]", slideText)
		}
	}

	return text.String(), nil
}

// getTextFromPptxSlide returns the concatenated text of all "t" elements in a
// single slide entry, each followed by one space. It returns an empty string
// and the underlying error if the entry cannot be opened or its XML cannot be
// decoded.
func getTextFromPptxSlide(f *zip.File) (string, error) {
	rc, err := f.Open()
	if err != nil {
		return "", err
	}
	// The entry is only read, so the Close error is intentionally ignored;
	// defer guarantees the reader is released on every return path.
	defer rc.Close()

	var slideText strings.Builder

	decoder := xml.NewDecoder(rc)
	for {
		token, err := decoder.Token()
		if err == io.EOF {
			return slideText.String(), nil
		}
		if err != nil {
			return "", err
		}

		// Only the local name is compared, so "t" elements match regardless
		// of their XML namespace.
		startElement, ok := token.(xml.StartElement)
		if !ok || startElement.Name.Local != "t" {
			continue
		}

		var content string
		if err := decoder.DecodeElement(&content, &startElement); err != nil {
			return "", err
		}
		slideText.WriteString(content)
		slideText.WriteString(" ")
	}
}