File: C:/github_repos/casibase_customer_0058/txt/pdf.go
// Copyright 2023 The Casibase Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package txt
import (
"errors"
"fmt"
"strings"
"github.com/casibase/pdf"
"github.com/pdfcpu/pdfcpu/pkg/api"
)
// getPageTexts returns the text items found in the content of page p.
//
// Any panic raised while reading the page content is recovered and reported
// through err instead of propagating: a string panic value becomes the error
// message, an error value is returned as-is, and any other value is formatted
// with fmt.Sprint. When a panic is recovered, texts is left nil.
func getPageTexts(p pdf.Page) (texts []pdf.Text, err error) {
	defer func() {
		recovered := recover()
		if recovered == nil {
			return
		}

		// Translate the panic value into the named err result.
		if msg, isString := recovered.(string); isString {
			err = errors.New(msg)
		} else if cause, isError := recovered.(error); isError {
			err = cause
		} else {
			err = errors.New(fmt.Sprint(recovered))
		}
	}()

	return p.Content().Text, nil
}
// getTextFromPdf extracts the text of the PDF file at path.
//
// The file is first validated with api.ValidateFile. If validation fails, the
// file is rewritten in place with api.OptimizeFile (input and output are both
// path) before it is opened; if that also fails, its error is returned.
//
// Pages whose value is null or that have no "Contents" entry are skipped. On
// each remaining page, consecutive text items that share the same Y value as
// the first item of the current line are concatenated into one line. All
// non-empty lines of all pages are joined with "\n" and returned.
//
// An error from opening the file or from reading any page's text aborts the
// extraction and is returned with an empty string.
func getTextFromPdf(path string) (string, error) {
	if err := api.ValidateFile(path, nil); err != nil {
		// Validation failed: try rewriting the file in place before opening it.
		if err = api.OptimizeFile(path, path, nil); err != nil {
			return "", err
		}
	}

	f, r, err := pdf.Open(path)
	if err != nil {
		return "", err
	}
	// Close the file exactly once, when the function returns. No further
	// defers are registered inside the page loop.
	defer f.Close()

	var mergedTexts []string
	totalPage := r.NumPage()
	for pageIndex := 1; pageIndex <= totalPage; pageIndex++ {
		p := r.Page(pageIndex)
		if p.V.IsNull() || p.V.Key("Contents").Kind() == pdf.Null {
			continue
		}

		texts, err := getPageTexts(p)
		if err != nil {
			return "", err
		}

		// lineStart is the first text item of the line being assembled; it
		// starts as the zero value on every page, so the initial reference Y
		// is the zero value of the field.
		var lineStart pdf.Text
		var line strings.Builder
		for _, text := range texts {
			if text.Y == lineStart.Y {
				line.WriteString(text.S)
				continue
			}

			// Y changed: flush the finished line (if any) and start a new one.
			if line.Len() > 0 {
				mergedTexts = append(mergedTexts, line.String())
			}
			line.Reset()
			lineStart = text
			line.WriteString(text.S)
		}
		if line.Len() > 0 {
			mergedTexts = append(mergedTexts, line.String())
		}
	}

	return strings.Join(mergedTexts, "\n"), nil
}