Clean long PDFs for LLM inference
Do you want to convert multi-page PDFs to HTML so that they can be passed to an LLM? Follow the steps below.
Get an API key
Prepare your API key to call the Upstage Document Parse API. If you don't have a key, you need to generate one by following the directions in the quick start guide.
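To keep the key out of your scripts, you can store it in an environment variable and read it at runtime. Below is a minimal sketch; the variable name UPSTAGE_API_KEY is just the convention used in this guide.

import os

# Read the API key from the environment instead of hard-coding it
API_KEY = os.environ.get("UPSTAGE_API_KEY")
if API_KEY is None:
    raise RuntimeError("Set the UPSTAGE_API_KEY environment variable first.")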
Prepare a PDF file
Let's prepare a PDF file. Our example file is a multi-page academic paper that contains tables and figures.
👉 Link to the PDF file
Split a long PDF into shorter PDFs
Many PDF files are long, sometimes running beyond 100 pages, which is the maximum page limit for the Document Parse API.
To process such long documents, or simply to get faster responses for easier debugging, we can split a long PDF into shorter PDFs.
Run the code below to split a PDF into shorter PDFs and save them to separate files.
Note that we saved the example PDF file above as paper.pdf.
"""
Requirements: `pip install pymupdf` to import fitz
"""
import os
import fitz
def split_pdf(input_file, batch_size):
# Open input_pdf
input_pdf = fitz.open(input_file)
num_pages = len(input_pdf)
print(f"Total number of pages: {num_pages}")
# Split input_pdf
for start_page in range(0, num_pages, batch_size):
end_page = min(start_page + batch_size, num_pages) - 1
# Write output_pdf to file
input_file_basename = os.path.splitext(input_file)[0]
output_file = f"{input_file_basename}_{start_page}_{end_page}.pdf"
print(output_file)
with fitz.open() as output_pdf:
output_pdf.insert_pdf(input_pdf, from_page=start_page, to_page=end_page)
output_pdf.save(output_file)
# Close input_pdf
input_pdf.close()
# Input arguments
input_file = "paper.pdf" # Replace with a file of your own
batch_size = 10 # Maximum available value is 100
split_pdf(input_file, batch_size)
You can see the resulting files in the terminal as follows.
$ ls paper*
paper.pdf paper_0_9.pdf paper_10_12.pdf
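If you want to sanity-check the split, you can reopen each output file and confirm its page count. A short sketch using the same pymupdf library:

from glob import glob

import fitz

# Verify the page count of each split file
for split_file in sorted(glob("paper_*.pdf")):
    with fitz.open(split_file) as pdf:
        print(split_file, len(pdf), "pages")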
Get Document Parse responses
To convert the resulting short PDF files to HTML, use the code below.
Make sure to replace UPSTAGE_API_KEY with your secret API key.
"""
Requirements: `pip install requests`
"""
from glob import glob
import json
import os
import requests
API_KEY = "UPSTAGE_API_KEY" # Change this to your API key
def call_document_parse(input_file, output_file):
# Send request
response = requests.post(
"https://api.upstage.ai/v1/document-ai/document-parse",
headers={"Authorization": f"Bearer {API_KEY}"},
files={"document": open(input_file, "rb")})
# Save response
if response.status_code == 200:
with open(output_file, "w") as f:
json.dump(response.json(), f, ensure_ascii=False)
else:
raise ValueError(f"Unexpected status code {response.status_code}.")
# Find all shorter PDFs related to input_file
input_file = "paper.pdf"
short_input_files = glob(os.path.splitext(input_file)[0] + "_*.pdf")
# Send request and save response for all shorter PDFs
for short_input_file in short_input_files:
print(short_input_file)
short_output_file = os.path.splitext(short_input_file)[0] + ".json"
call_document_parse(short_input_file, short_output_file)
You will now see JSON files in the terminal as follows.
$ ls paper*
paper.pdf paper_0_9.json paper_0_9.pdf paper_10_12.json paper_10_12.pdf
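Each JSON file contains the parsed elements for its page range. To get a feel for the response structure before moving on, you can print each element's category along with a snippet of its HTML; the elements, category, and html fields are the same ones used in the steps below.

import json

# Peek at the parsed elements of one response file
with open("paper_0_9.json", "r") as f:
    data = json.load(f)

for element in data["elements"]:
    print(element["category"], element["html"][:60])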
Crop elements
Document Parse offers base64 encoding for layout elements that users wish to extract as images. Let's extract all figures and save them as image files. In the code provided in Step 4 above, you can simply add {"base64_encoding": "['figure']"} to the request body, and the API will return a base64-encoded string for every figure element. The following code then parses the response and saves the first figure to an image file.
"""
Requirements: `pip install requests`
"""
from glob import glob
import json
import os
import requests
API_KEY = "UPSTAGE_API_KEY" # Change this to your API key
def call_document_parse(input_file, output_file):
# Send request
response = requests.post(
"https://api.upstage.ai/v1/document-ai/document-parse",
headers={"Authorization": f"Bearer {API_KEY}"},
data={"base64_encoding": "['figure']"}, # base64 encoding for cropped image of the figure category.
files={"document": open(input_file, "rb")})
# Save response
if response.status_code == 200:
with open(output_file, "w") as f:
json.dump(response.json(), f, ensure_ascii=False)
else:
raise ValueError(f"Unexpected status code {response.status_code}.")
# Find all shorter PDFs related to input_file
input_file = "paper.pdf"
short_input_files = glob(os.path.splitext(input_file)[0] + "_*.pdf")
# Send request and save response for all shorter PDFs
for short_input_file in short_input_files:
print(short_input_file)
short_output_file = os.path.splitext(short_input_file)[0] + ".json"
call_document_parse(short_input_file, short_output_file)
import base64
import json

# Input parameters
input_file = "paper_0_9.pdf"
json_file = "paper_0_9.json"
output_file = "paper_cropped_figure.png"

# Load the JSON response file
with open(json_file, "r") as f:
    data = json.load(f)

# Find the first figure element and decode its base64-encoded image
for element in data["elements"]:
    if element["category"] == "figure":
        with open(output_file, "wb") as fh:
            fh.write(base64.decodebytes(str.encode(element["base64_encoding"])))
        break
The resulting figure can be seen as follows.
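The loop above stops after the first figure. If you want to save every figure instead, a small variation numbers the output files; the paper_figure_N.png names below are just illustrative.

import base64
import json

with open("paper_0_9.json", "r") as f:
    data = json.load(f)

# Save every figure element to its own numbered image file
figure_count = 0
for element in data["elements"]:
    if element["category"] == "figure":
        with open(f"paper_figure_{figure_count}.png", "wb") as fh:
            fh.write(base64.decodebytes(str.encode(element["base64_encoding"])))
        figure_count += 1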
Read relevant elements and write to HTML
Not all document elements are relevant; sometimes we want to exclude elements that are irrelevant to the document's context.
Here, we will preprocess the PDF content for LLM compatibility by retaining paragraph, table, and figure elements, and discarding header and footer elements.
from glob import glob
import json
import os


def get_html(data_file, excludes=[]):
    # Read API response data
    with open(data_file, "r") as f:
        data = json.load(f)

    # Get HTML from data and exclude some element categories
    htmls = []
    for element in data["elements"]:
        if element["category"] not in excludes:
            htmls.append(element["html"])
    return "\n".join(htmls)


def get_merged_html(data_files, excludes=[]):
    # Merge the HTML of all data files, in page order
    merged_html = ""
    for data_file in sorted(data_files):
        print(data_file)
        html = get_html(data_file, excludes=excludes)
        merged_html += f"{html}\n"
    return merged_html


# Input arguments
input_file = "paper.pdf"
output_file = "paper.html"

# Iterate short PDF data and get HTML
data_files = glob(os.path.splitext(input_file)[0] + "_*.json")
merged_html = get_merged_html(data_files, excludes=["header", "footer"])

# Write merged_html to output_file
with open(output_file, "w") as f:
    f.write(merged_html)
Render the resulting HTML in a Web browser
Open paper.html in a Web browser. You will see a page like the one below.
Chunk the HTML into segments
You can use each HTML tag as a chunk, but if you want to fully utilize an LLM's context length, you can use length-based chunking as follows.
def get_chunks(lines, max_context_length=4096):
    # Greedily pack lines into chunks of at most max_context_length characters
    chunks = []
    current_chunk = []
    for line in lines:
        if len(" ".join(current_chunk + [line])) <= max_context_length:
            current_chunk.append(line)
        else:
            chunks.append(current_chunk)
            current_chunk = [line]
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


with open("paper.html", "r") as f:
    lines = f.read().split("\n")

chunks = get_chunks(lines, max_context_length=1024)
print(chunks[0])
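From here, each chunk can be passed to an LLM. The sketch below assumes an OpenAI-compatible chat completions endpoint; the URL and model name are placeholders for whichever LLM you use, and API_KEY is the key defined earlier.

import requests

# Placeholder endpoint and model; replace with your LLM provider's values
CHAT_URL = "https://api.example.com/v1/chat/completions"
MODEL = "your-model-name"

chunk_text = "\n".join(chunks[0])
response = requests.post(
    CHAT_URL,
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "model": MODEL,
        "messages": [
            {"role": "user",
             "content": f"Summarize this document section:\n{chunk_text}"},
        ],
    },
)
print(response.json())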