HEX
Server: Apache/2.4.54 (Win64) OpenSSL/1.1.1p PHP/7.4.30
System: Windows NT website-api 10.0 build 20348 (Windows Server 2016) AMD64
User: SYSTEM (0)
PHP: 7.4.30
Disabled: NONE
Upload Files
File: C:/github_repos/casibase_customer_0022/txt/markitdown.go
// Copyright 2025 The Casibase Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package txt

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"os/exec"
	"unicode/utf8"

	"github.com/casibase/casibase/i18n"
	"golang.org/x/text/encoding/simplifiedchinese"
	"golang.org/x/text/transform"
)

// markitdownExists reports whether exec.LookPath could resolve the
// "markitdown" executable when this package was initialized.
var markitdownExists bool

// init probes for the markitdown executable once, at package load, and caches
// the outcome in markitdownExists.
func init() {
	_, lookErr := exec.LookPath("markitdown")
	markitdownExists = lookErr == nil
}

// gbkToUtf8 runs gbkData through the GBK decoder and returns the decoded
// bytes as a string. If reading the decoded stream fails, it returns an empty
// string together with that error.
func gbkToUtf8(gbkData []byte) (string, error) {
	src := bytes.NewReader(gbkData)
	decoded, err := io.ReadAll(transform.NewReader(src, simplifiedchinese.GBK.NewDecoder()))
	if err != nil {
		return "", err
	}
	return string(decoded), nil
}

// GetTextFromMarkitdown converts the file at path to text by running the
// external "markitdown" command with the file attached to its standard input,
// and returns what the command writes to standard output.
//
// Output that is already valid UTF-8 is returned unchanged; any other output
// is decoded as GBK via gbkToUtf8. lang is passed to i18n.Translate to select
// the language of the error messages.
//
// An error is returned when markitdownExists is false, when the file cannot be
// opened, when the command exits with an error (its stderr is included in the
// message), or when GBK decoding fails.
func GetTextFromMarkitdown(path string, lang string) (string, error) {
	if !markitdownExists {
		// Pass the translated text as an argument rather than as the format
		// string, so a '%' inside a translation is never read as a verb.
		return "", fmt.Errorf("%s", i18n.Translate(lang, "txt:GetTextFromMarkitdown() error, markitdown does not exist"))
	}

	// Open the input file; it is streamed to the command rather than read
	// into memory here.
	file, err := os.Open(path)
	if err != nil {
		return "", fmt.Errorf(i18n.Translate(lang, "txt:GetTextFromMarkitdown() error, failed to open file: %v"), err)
	}
	defer file.Close()

	cmd := exec.Command("markitdown")
	cmd.Stdin = file

	// Capture stdout and stderr separately so stderr can be reported on failure.
	var out, stderr bytes.Buffer
	cmd.Stdout = &out
	cmd.Stderr = &stderr

	if err = cmd.Run(); err != nil {
		return "", fmt.Errorf(i18n.Translate(lang, "txt:GetTextFromMarkitdown() error, markitdown failed to run: %v: %s"), err, stderr.String())
	}

	outputBytes := out.Bytes()
	if utf8.Valid(outputBytes) {
		return string(outputBytes), nil
	}

	// NOTE(review): non-UTF-8 output is assumed to be GBK (presumably from a
	// Chinese-locale Windows host) — confirm no other encodings are expected.
	return gbkToUtf8(outputBytes)
}