import hljs from 'highlight.js/lib/core';
import java from 'highlight.js/lib/languages/java';
import xml from 'highlight.js/lib/languages/xml';
import 'highlight.js/styles/a11y-dark.css';
import {useEffect} from "react";

export default function Content1() {
    useEffect(() => {
        document.title = 'How to Extract Text from a PDF using Java and PDFBox';
        document.description = 'You want to extract text from a pdf? Do you know how to program in Java? Great! Lets dive in with some code!t' +
            '';

        hljs.registerLanguage('java', java);
        hljs.highlightAll();
    }, []);

    return (
        <div className="lg:px-28 text-gray-700">
            <h1 className="font-bold text-4xl text-gray-800">
                How to Extract Text from a PDF using Java and PDFBox
            </h1>
            <p className="text-gray-600 border-l-2 pl-4 text-sm mt-6">November 25, 2023</p>
            <p className="text-gray-600 border-l-2 pl-4 mt-0 text-sm">
                Posted by <b>Chris Ryan</b>
            </p>
            <p className="text-lg mt-6">
                <b>Step 1:</b> <a href="https://mvnrepository.com/artifact/org.apache.pdfbox/pdfbox">Download PDFBox</a>,
                the open source library we'll be using to open a pdf. For you fellow maven meeples, I'll save you a click:
            </p>
            <div className="mt-3">
                <pre>
                    <code className="language-java rounded-lg max-w-md mx-auto">
                        {
`<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.0</version>
</dependency>
`
                        }
                    </code>
                </pre>
            </div>

            <p className="text-lg mt-9">
                <b>Step 2:</b> Create a class and call it CharacterCatcher.java, then copy and paste this code into the file:
            </p>
            <div className="mt-3">
                <pre>
                    <code className="language-java rounded-lg max-w-3xl mx-auto">
                        {
`public class CharacterCatcher extends PDFTextStripper {
    private List<TextPosition> textPositions = new ArrayList<>();

    public CharacterCatcher() throws IOException {
    }

    @Override
    protected void writeString(String string, List<TextPosition> textPositions) 
        throws IOException 
    {
        this.textPositions.addAll(textPositions);
    }

    public List<TextPosition> getTextPositions() {
        return this.textPositions;
    }
}
`}
                    </code>
                </pre>
            </div>
            <p className="text-lg mt-3">
                You can call this class whatever you want, but I think this one is catchy.
                What is it doing? It’s basically ‘listening’ to PDFBox while it reads in the characters from a PDF,
                and then stores them in an array.
            </p>

            <p className="text-lg mt-9">
                <b>Step 3:</b> And for the final step, read in a PDF file and extract the text! Maybe that’s 2 steps.
            </p>
            <div className="mt-3">
                <pre>
                    <code className="language-java rounded-lg max-w-3xl mx-auto">
                        {
`public static void main(String args[]) throws IOException {
    var pdfFile = new File("a/path/to/your/file.pdf");
    List<TextPosition> everySingleCharacter = new ArrayList<>();
    try (PDDocument pdfDoc = PDDocument.load(pdfFile)) {
        var charCatcher = new CharacterCatcher();
        charCatcher.setSortByPosition(true);
        charCatcher.setStartPage(1);
        charCatcher.setEndPage(4);
        var out = new OutputStreamWriter(new ByteArrayOutputStream());
        charCatcher.writeText(pdfDoc, out);
        everySingleCharacter.addAll(charCatcher.getTextPositions());
    }
}
`}
                    </code>
                </pre>
            </div>
            <p className="text-lg my-3">
                And of course, set whatever start / end page you want for the text extraction (1 based index).
            </p>

            <p className="text-lg mt-9">
                You could stop at step 3 if you're satisfied. But if you, like me, wanted the WORDS from the pdf, and not just the letters..
            </p>
            <p className="text-lg mt-3">
                <b>Step 4:</b> Group the letters into words with this function:
            </p>
            <div className="mt-3">
                <pre>
                    <code className="language-java rounded-lg max-w-3xl mx-auto">
                        {
`public List<List<TextPosition>> groupIntoWords(List<TextPosition> textPositions) {
    textPositions.sort((a, b) -> {
        var yCompare = Float.compare(a.getY(), b.getY());
        if (yCompare == 0) {
            return Float.compare(a.getX(), b.getX());
        } else {
            return yCompare;
        }
    });

    List<List<TextPosition>> words = new ArrayList<>();
    var word = new ArrayList<TextPosition>();

    final double allowableXDistance = 1.0;
    final double allowableYDistance = 1.0;
    for (var pos : textPositions) {
        if (!pos.getUnicode().equals(" ")) {
            if (word.isEmpty()) {
                word.add(pos);
            } else {
                var lastPos = word.get(word.size() - 1);
                var closeAlongX = lastPos.getEndX() - pos.getX() <= allowableXDistance;
                var closeAlongY = lastPos.getY() - pos.getY() <= allowableYDistance;
                if (closeAlongX && closeAlongY) {
                    word.add(pos);
                } else {
                    words.add(word);
                    word = new ArrayList<>();
                    word.add(pos);
                }
            }
        }
    }

    return words;
}
`}
                    </code>
                </pre>
            </div>
            <p className="text-lg mt-3">
                This function will
                1. Sort the letters, 2. Remove spaces, and 3. Group letters that are really close to each other, aka, words.
            </p>

            <p className="text-lg mt-9">
                Lastly, one would use the function like so:
            </p>
            <div className="mt-3">
                <pre>
                    <code className="language-java rounded-lg max-w-3xl mx-auto">
                        {
`public static void main(String args[]) throws IOException {
    var pdfFile = new File("a/path/to/your/file.pdf");
    try (PDDocument pdfDoc = PDDocument.load(pdfFile)) {
        for (int i = 1; i <= pdfDoc.getNumberOfPages(); i++) {
            var charCatcher = new CharacterCatcher();
            charCatcher.setSortByPosition(true);
            charCatcher.setStartPage(i);
            charCatcher.setEndPage(i);
            var out = new OutputStreamWriter(new ByteArrayOutputStream());
            charCatcher.writeText(pdfDoc, out);

            var lettersFromOnePage = charCatcher.getTextPositions();
            
            // Pass the letters into our new handy word making function
            var wordsFromOnePage = groupIntoWords(lettersFromOnePage);
            
            // And then do whatever processing you want 
            // with your liberated-from-a-pdf words
        }
    }
}
`}
                    </code>
                </pre>
            </div>
            <p className="text-lg my-9">
                OK. <b>Done!</b> No more steps. <br />
                Is this the best way to do it? Well, I'm sure there are improvements to be had.
                But does it work? With the pdfs I've tested with, yes!, and I've tested it with a lot of different ones.
                So, hopefully it works for you too.
                Maybe in the next post I'll cover the extraction of transactions from my Chase bank statements. It was.. involved.
            </p>
        </div>
    );
}