HEX
Server: Apache/2.4.54 (Win64) OpenSSL/1.1.1p PHP/7.4.30
System: Windows NT website-api 10.0 build 20348 (Windows Server 2016) AMD64
User: SYSTEM (0)
PHP: 7.4.30
Disabled: NONE
Upload Files
File: C:/github_repos/casibase/txt/pdf_alibaba_cloud.go
// Copyright 2025 The Casibase Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package txt

import (
	"bytes"
	"crypto/tls"
	"encoding/base64"
	"encoding/json"
	"fmt"
	"io"
	"net/http"
	"os"
	"path/filepath"

	"github.com/beego/beego/logs"
	"github.com/carmel/gooxml/document"
	"github.com/pdfcpu/pdfcpu/pkg/api"
)

const (
	// Alibaba cloud market
	appCode    = ""
	apiHost    = "https://generalpdf.market.alicloudapi.com"
	apiPath    = "/ocrservice/pdf"
	pdfDirPath = "papar_QA_dataset\\papers"
	tmpDir     = "tmp_splits"
	outDir     = "outputdir"
)

type OcrResponse struct {
	FileBase64 string `json:"fileBase64"`
}

type RequestBody struct {
	FileBase64 string `json:"fileBase64"`
	FileType   string `json:"fileType"`
}

func processPdf(localPDFPath, number string) error {
	err := os.RemoveAll(tmpDir)
	if err != nil {
		return err
	}

	ctx, err := api.ReadContextFile(localPDFPath)
	if err != nil {
		return err
	}

	pageCount := ctx.PageCount
	logs.Info("Total pages: %d\n", pageCount)

	ans := ""
	batchSize := 20
	for start := 1; start <= pageCount; start += batchSize {
		err = os.Mkdir(tmpDir, 0o755)
		if err != nil {
			return err
		}

		end := start + batchSize - 1
		if end > pageCount {
			end = pageCount
		}

		rangeStr := fmt.Sprintf("%d-%d", start, end)
		err = api.ExtractPagesFile(localPDFPath, tmpDir, []string{rangeStr}, nil)
		if err != nil {
			return err
		}

		files, err := os.ReadDir(tmpDir)
		if err != nil {
			return err
		}

		pagesToMerge := []string{}
		for _, f := range files {
			if !f.IsDir() && filepath.Ext(f.Name()) == ".pdf" {
				pagesToMerge = append(pagesToMerge, filepath.Join(tmpDir, f.Name()))
			}
		}

		mergedFile := filepath.Join(tmpDir, fmt.Sprintf("merged_%d_%d.pdf", start, end))

		err = api.MergeCreateFile(pagesToMerge, mergedFile, false, nil)
		if err != nil {
			return fmt.Errorf("failed to extract pages %s: %v", rangeStr, err)
		}

		res := parsePdf(mergedFile)
		ans = ans + res

		err = os.RemoveAll(tmpDir)
		if err != nil {
			return err
		}
	}

	outputFile := outDir + "\\" + number + ".txt"
	err = os.WriteFile(outputFile, []byte(ans), 0o644)
	if err != nil {
		return fmt.Errorf("write to %s failed: %v", outputFile, err)
	}

	logs.Info("Saved result to:", outputFile)

	return nil
}

func parsePdf(outputFile string) string {
	b64, err := fileToBase64(outputFile)
	if err != nil {
		return fmt.Sprintf("file to base64 failed: %v", err)
	}

	reqBody := RequestBody{
		FileBase64: b64,
		FileType:   "word",
	}

	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return fmt.Sprintf("json marshal failed: %v", err)
	}

	url := apiHost + apiPath
	req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
	if err != nil {
		return fmt.Sprintf("create request failed: %v", err)
	}

	req.Header.Add("Authorization", "APPCODE "+appCode)
	req.Header.Add("Content-Type", "application/json; charset=UTF-8")

	tr := &http.Transport{
		TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
	}
	client := &http.Client{Transport: tr}

	resp, err := client.Do(req)
	if err != nil {
		return fmt.Sprintf("http request failed: %v", err)
	}

	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return fmt.Sprintf("read response failed: %v", err)
	}

	var ocrResp OcrResponse
	err = json.Unmarshal(body, &ocrResp)
	if err != nil {
		return err.Error()
	}

	text, err := extractTextFromWordBase64(ocrResp.FileBase64)
	if err != nil {
		return fmt.Sprintf("extract text from word base64 failed: %v", err)
	}

	return text
}

func extractTextFromWordBase64(base64Str string) (string, error) {
	data, err := base64.StdEncoding.DecodeString(base64Str)
	if err != nil {
		return "", fmt.Errorf("base64 decode error: %v", err)
	}

	doc, err := document.Read(bytes.NewReader(data), int64(len(data)))
	if err != nil {
		return "", fmt.Errorf("failed to read docx: %v", err)
	}

	var text string
	for _, para := range doc.Paragraphs() {
		for _, run := range para.Runs() {
			text += run.Text()
		}
		text += "\n"
	}
	return text, nil
}

func fileToBase64(filePath string) (string, error) {
	data, err := os.ReadFile(filePath)
	if err != nil {
		return "", fmt.Errorf("cannot read file: %v", err)
	}

	encoded := base64.StdEncoding.EncodeToString(data)
	return encoded, nil
}