Many documents
ParaLLeM excels at processing documents at high throughput.
By switching to batch mode, you save 50% on token costs, reduce CPU time, and can scale to thousands of documents.
import os
from pathlib import Path
from dotenv import load_dotenv
import requests
import parallem as pllm
# Public-domain texts from Project Gutenberg used as example inputs.
# Each URL points at the plain-text (.txt) edition of the work.
documents = [
    "https://www.gutenberg.org/files/43/43-0.txt",  # Jekyll and Hyde
    "https://www.gutenberg.org/files/1952/1952-0.txt",  # Yellow Wallpaper
    "https://www.gutenberg.org/files/208/208-0.txt",  # Daisy Miller
    "https://www.gutenberg.org/files/2641/2641-0.txt",  # A Room with a View
    "https://www.gutenberg.org/files/64317/64317-0.txt",  # The Great Gatsby
]
def download_documents():
    """Download the example texts to examples/documents/txts.

    Files that already exist on disk are skipped. Raises
    ``requests.HTTPError`` if a download returns an error status.
    """
    txts_dir = Path("examples/documents/txts")
    txts_dir.mkdir(parents=True, exist_ok=True)
    for url in documents:
        filepath = txts_dir / url.split("/")[-1]
        if filepath.exists():
            print(f"{filepath} already exists, skipping download.")
            continue
        print(f"Downloading {url} to {filepath}...")
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        # Write the raw bytes. Using response.text would let requests
        # *guess* the encoding (latin-1 by default for text/* responses
        # with no charset header) and then re-encode as UTF-8, which can
        # corrupt the UTF-8 Gutenberg files.
        filepath.write_bytes(response.content)
def summarizer_agent(agt: pllm.AgentContext, doc: str):
    """Ask the agent's LLM how the main character changes in *doc*.

    Returns the agent response object produced by ``agt.ask_llm``.
    """
    prompt = "In 1 paragraph, how does the main character change in this literary work?"
    return agt.ask_llm(prompt, doc)
if __name__ == "__main__":
    download_documents()
    load_dotenv()

    # Reuse a single Path for the document directory instead of
    # re-building string paths per file (consistent with
    # download_documents, which already uses pathlib).
    txts_dir = Path("examples/documents/txts")

    with pllm.resume_directory(
        ".pllm/example/document_processing",
        strategy="batch",
    ) as orch:
        # Sort for a deterministic processing order (os.listdir order
        # is filesystem-dependent); one agent per document.
        for filepath in sorted(txts_dir.iterdir()):
            doc = filepath.read_text(encoding="utf-8")
            with orch.agent(filepath.name) as agt:
                out = summarizer_agent(agt, doc)
                print(out.final_answer[:40] + "...")
[INFO] Resuming with session_id=6
The protagonist moves from a passive, he...
Winterbourne, the book’s central figure,...
Lucy Honeychurch begins as a sheltered, ...
Henry Jekyll begins as a respected, self...
Jay Gatsby begins as a penniless, aspiri...