import os

from reducto import Reducto
# Build the Reducto client. The API key is read from the REDUCTO_API_KEY
# environment variable; ``os.environ.get`` yields None when it is unset.
client = Reducto(
    api_key=os.environ.get("REDUCTO_API_KEY"),
    timeout=300,
)
document_url = "https://support.adp.com/adp_payroll/content/hybrid/PDF/W2_Interactive.pdf"

# Step 1: Parse
# The job id of this parse run is what the later extract calls reference
# (as ``jobid://<job_id>``) instead of the original document URL.
parse_response = client.parse.run(document_url=document_url)
job_id = parse_response.job_id
# Step 2: Classify document type
# Run an extraction against the already-parsed job with a one-field schema
# whose only property is an enum of the document types this script knows.
classification = client.extract.run(
    document_url=f"jobid://{job_id}",
    schema={
        "type": "object",
        "properties": {
            "document_type": {"type": "string", "enum": ["W2", "Passport", "Other"]},
        },
        "required": ["document_type"],
    },
)
print(classification.result)

# The first result entry carries the classification. An empty result or a
# missing key is reported as a ValueError with a clear message instead of a
# bare IndexError/KeyError.
try:
    document_type = classification.result[0]["document_type"]
except (IndexError, KeyError) as err:
    raise ValueError(
        "Classification result did not contain a document_type"
    ) from err
# Step 3: Choose schema based on classification
# Each supported document type maps to the JSON schema used for the final
# extraction; any other type is rejected with a ValueError.
if document_type == "W2":
    schema = {
        "type": "object",
        "properties": {
            "total_wages": {"type": "number"},
            "calendar_year": {"type": "integer"},
            "employer_name": {"type": "string"},
        },
        "required": ["total_wages", "calendar_year", "employer_name"],
    }
elif document_type == "Passport":
    # NOTE(review): unlike the W2 schema, no fields are listed as
    # "required" here -- confirm that this is intentional.
    schema = {
        "type": "object",
        "properties": {
            "passport_number": {"type": "string"},
            "name": {"type": "string"},
            "date_of_birth": {"type": "string"},
        },
    }
else:
    raise ValueError(f"Unsupported document type: {document_type}")
# Step 4: Extract structured fields
# Reuse the parse job (no second upload of the document) and apply the
# schema bound to ``schema`` for the detected document type.
extraction_source = f"jobid://{job_id}"
extract_response = client.extract.run(document_url=extraction_source, schema=schema)
print(extract_response.result)