Clean long PDFs for LLM inference
Do you want to convert multi-page PDFs to HTML so that they can be passed to an LLM? Follow the steps below.
Get an API key
Prepare your API key to call the Upstage Document Parse API. If you don't have a key, you need to generate one by following the directions in the quick start guide.
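To keep the key out of your scripts, you can store it in an environment variable and read it at runtime. Below is a minimal sketch; the variable name UPSTAGE_API_KEY is just the convention used in this guide.

import os

# Read the API key from the environment instead of hard-coding it
API_KEY = os.environ.get("UPSTAGE_API_KEY")
if API_KEY is None:
    raise RuntimeError("Set the UPSTAGE_API_KEY environment variable first.")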
Prepare a PDF file
Let's prepare a PDF file. Our example file is a multi-page academic paper that contains tables and figures.
👉 Link to the PDF file
Split a long PDF into shorter PDFs
Many PDF files are long, sometimes running beyond 100 pages, which is the maximum page limit for the Document Parse API.
To process such long documents, or simply to get faster responses for easier debugging, we can split a long PDF into shorter PDFs.
Run the code below to split a PDF into shorter PDFs and save them to separate files.
Note that we saved the example PDF file above as paper.pdf.
"""
Requirements: `pip install pymupdf` to import fitz
"""
import os
import fitz
def split_pdf(input_file, batch_size):
# Open input_pdf
input_pdf = fitz.open(input_file)
num_pages = len(input_pdf)
print(f"Total number of pages: {num_pages}")
# Split input_pdf
for start_page in range(0, num_pages, batch_size):
end_page = min(start_page + batch_size, num_pages) - 1
# Write output_pdf to file
input_file_basename = os.path.splitext(input_file)[0]
output_file = f"{input_file_basename}_{start_page}_{end_page}.pdf"
print(output_file)
with fitz.open() as output_pdf:
output_pdf.insert_pdf(input_pdf, from_page=start_page, to_page=end_page)
output_pdf.save(output_file)
# Close input_pdf
input_pdf.close()
# Input arguments
input_file = "paper.pdf" # Replace with a file of your own
batch_size = 10 # Maximum available value is 100
split_pdf(input_file, batch_size)
You can see the resulting files in the terminal as follows.
$ ls paper*
paper.pdf paper_0_9.pdf paper_10_12.pdf
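If you want to sanity-check the split, you can reopen each output file and confirm its page count. A short sketch using the same pymupdf library:

from glob import glob

import fitz

# Verify the page count of each split file
for split_file in sorted(glob("paper_*.pdf")):
    with fitz.open(split_file) as pdf:
        print(split_file, len(pdf), "pages")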
Get Document Parse responses
To convert the resulting short PDF files to HTML, use the code below.
Make sure to replace UPSTAGE_API_KEY with your secret API key.
"""
Requirements: `pip install requests`
"""
from glob import glob
import json
import os
import requests
API_KEY = "UPSTAGE_API_KEY" # Change this to your API key
def call_document_parse(input_file, output_file):
# Send request
response = requests.post(
"https://api.upstage.ai/v1/document-ai/document-parse",
headers={"Authorization": f"Bearer {API_KEY}"},
files={"document": open(input_file, "rb")})
# Save response
if response.status_code == 200:
with open(output_file, "w") as f:
json.dump(response.json(), f, ensure_ascii=False)
else:
raise ValueError(f"Unexpected status code {response.status_code}.")
# Find all shorter PDFs related to input_file
input_file = "paper.pdf"
short_input_files = glob(os.path.splitext(input_file)[0] + "_*.pdf")
# Send request and save response for all shorter PDFs
for short_input_file in short_input_files:
print(short_input_file)
short_output_file = os.path.splitext(short_input_file)[0] + ".json"
call_document_parse(short_input_file, short_output_file)
You will now see JSON files in the terminal as follows.
$ ls paper*
paper.pdf paper_0_9.json paper_0_9.pdf paper_10_12.json paper_10_12.pdf
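Each JSON file contains the parsed elements for its page range. To get a feel for the response structure before moving on, you can print each element's category along with a snippet of its HTML; the elements, category, and html fields are the same ones used in the steps below.

import json

# Peek at the parsed elements of one response file
with open("paper_0_9.json", "r") as f:
    data = json.load(f)

for element in data["elements"]:
    print(element["category"], element["html"][:60])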
Crop elements
Document Parse offers base64 encoding for layout elements that users wish to extract as images. Let's extract all figures and save them as image files. In the code provided in Step 4 above, you can simply add {"base64_encoding": "['figure']"} to the request body, and the API will return a base64-encoded string for every figure element. The following code then parses the response and saves the first figure to an image file.
"""
Requirements: `pip install requests`
"""
from glob import glob
import json
import os
import requests
API_KEY = "UPSTAGE_API_KEY" # Change this to your API key
def call_document_parse(input_file, output_file):
# Send request
response = requests.post(
"https://api.upstage.ai/v1/document-ai/document-parse",
headers={"Authorization": f"Bearer {API_KEY}"},
data={"base64_encoding": "['figure']"}, # base64 encoding for cropped image of the figure category.
files={"document": open(input_file, "rb")})
# Save response
if response.status_code == 200:
with open(output_file, "w") as f:
json.dump(response.json(), f, ensure_ascii=False)
else:
raise ValueError(f"Unexpected status code {response.status_code}.")
# Find all shorter PDFs related to input_file
input_file = "paper.pdf"
short_input_files = glob(os.path.splitext(input_file)[0] + "_*.pdf")
# Send request and save response for all shorter PDFs
for short_input_file in short_input_files:
print(short_input_file)
short_output_file = os.path.splitext(short_input_file)[0] + ".json"
call_document_parse(short_input_file, short_output_file)
import base64
import json

# Input parameters
input_file = "paper_0_9.pdf"
json_file = "paper_0_9.json"
output_file = "paper_cropped_figure.png"

# Load the JSON response file
with open(json_file, "r") as f:
    data = json.load(f)

# Find the first figure element and decode its base64-encoded image
for element in data["elements"]:
    if element["category"] == "figure":
        with open(output_file, "wb") as fh:
            fh.write(base64.decodebytes(str.encode(element["base64_encoding"])))
        break
The resulting figure can be seen as follows.
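The loop above stops after the first figure. If you want to save every figure instead, a small variation numbers the output files; the paper_figure_N.png names below are just illustrative.

import base64
import json

with open("paper_0_9.json", "r") as f:
    data = json.load(f)

# Save every figure element to its own numbered image file
figure_count = 0
for element in data["elements"]:
    if element["category"] == "figure":
        with open(f"paper_figure_{figure_count}.png", "wb") as fh:
            fh.write(base64.decodebytes(str.encode(element["base64_encoding"])))
        figure_count += 1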
Read relevant elements and write to HTML
Not all document elements are relevant; sometimes we want to exclude elements that are irrelevant to the document's context.
Here, we will preprocess the PDF content for LLM compatibility by retaining paragraph, table, and figure elements, and discarding header and footer elements.
from glob import glob
import json
import os


def get_html(data_file, excludes=[]):
    # Read API response data
    with open(data_file, "r") as f:
        data = json.load(f)

    # Get HTML from data and exclude some element categories
    htmls = []
    for element in data["elements"]:
        if element["category"] not in excludes:
            htmls.append(element["html"])
    return "\n".join(htmls)


def get_merged_html(data_files, excludes=[]):
    # Merge the HTML of all data files, in page order
    merged_html = ""
    for data_file in sorted(data_files):
        print(data_file)
        html = get_html(data_file, excludes=excludes)
        merged_html += f"{html}\n"
    return merged_html


# Input arguments
input_file = "paper.pdf"
output_file = "paper.html"

# Iterate short PDF data and get HTML
data_files = glob(os.path.splitext(input_file)[0] + "_*.json")
merged_html = get_merged_html(data_files, excludes=["header", "footer"])

# Write merged_html to output_file
with open(output_file, "w") as f:
    f.write(merged_html)
Render the resulting HTML in a Web browser
Open paper.html in a Web browser. You will see a page like the one below.
Chunk the HTML into segments
You can use each HTML tag as a chunk, but if you want to fully utilize an LLM's context length, you can use length-based chunking as follows.
def get_chunks(lines, max_context_length=4096):
    # Greedily pack lines into chunks of at most max_context_length characters
    chunks = []
    current_chunk = []
    for line in lines:
        if len(" ".join(current_chunk + [line])) <= max_context_length:
            current_chunk.append(line)
        else:
            chunks.append(current_chunk)
            current_chunk = [line]
    if current_chunk:
        chunks.append(current_chunk)
    return chunks


with open("paper.html", "r") as f:
    lines = f.read().split("\n")

chunks = get_chunks(lines, max_context_length=1024)
print(chunks[0])
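From here, each chunk can be passed to an LLM. The sketch below assumes an OpenAI-compatible chat completions endpoint; the URL and model name are placeholders for whichever LLM you use, and API_KEY is the key defined earlier.

import requests

# Placeholder endpoint and model; replace with your LLM provider's values
CHAT_URL = "https://api.example.com/v1/chat/completions"
MODEL = "your-model-name"

chunk_text = "\n".join(chunks[0])
response = requests.post(
    CHAT_URL,
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "model": MODEL,
        "messages": [
            {"role": "user",
             "content": f"Summarize this document section:\n{chunk_text}"},
        ],
    },
)
print(response.json())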