from pathlib import Path
from reducto import Reducto
import pandas as pd
# Build the Reducto client from the API key stored in the "reducto" secret scope.
api_key = dbutils.secrets.get(scope="reducto", key="REDUCTO_API_KEY")
client = Reducto(api_key=api_key)

# Directory whose entries are uploaded one by one.
folder = Path("/Workspace/Users/Reducto/blood_test_results")

# NOTE(review): nothing in this section appends to `records`; presumably the
# extraction results are collected here further down the file -- confirm.
records = []

# The parse and extract calls receive the same options, so the dict is
# defined once here instead of being rebuilt twice per file.
REDUCTO_OPTIONS = {
    "ocr_mode": "standard",
    "extraction_mode": "ocr",
    "chunking": {"chunk_mode": "variable"},
    "table_summary": {"enabled": False},
    "figure_summary": {
        "enabled": False,
        "override": False,
    },
    "force_url_result": False,
}

# Prompt sent with every extract call (adjacent literals concatenate into
# a single string).
EXTRACT_SYSTEM_PROMPT = (
    "Be precise and thorough. These are blood test results of varying page "
    "lengths and structures. Use visual layout cues such as bold labels, "
    "column alignment, and section dividers to interpret structure."
)

# JSON schema describing the fields requested from each document.
BLOOD_TEST_SCHEMA = {
    "type": "object",
    "properties": {
        "patientName": {
            "type": "string",
            "description": "The full name of the patient.",
        },
        "dateOfBirth": {
            "type": "string",
            "description": "The date of birth of the patient, formatted as YYYY-MM-DD.",
        },
        "hemoglobinCount": {
            "type": "number",
            "description": "The hemoglobin count in the patient's blood, measured in grams per deciliter.",
        },
        "redBloodCellCount": {
            "type": "number",
            "description": "The count of red blood cells in the patient's blood.",
        },
        "whiteBloodCellCount": {
            "type": "number",
            "description": "The count of white blood cells in the patient's blood.",
        },
    },
    "required": [
        "patientName",
        "dateOfBirth",
        "hemoglobinCount",
        "redBloodCellCount",
        "whiteBloodCellCount",
    ],
}

for blood_test in folder.iterdir():
    # iterdir() also yields sub-directories; only regular files are uploaded.
    if not blood_test.is_file():
        continue

    upload = client.upload(file=blood_test)

    parse_response = client.parse.run(
        document_url=upload,
        options=REDUCTO_OPTIONS,
    )
    # You can use this fully parsed response for something if needed!
    job_id = parse_response.job_id

    # The extract call points at the parse job by id rather than at the
    # uploaded file (presumably so the parsed output is reused -- confirm
    # against the Reducto documentation).
    response = client.extract.run(
        document_url=f"jobid://{job_id}",
        system_prompt=EXTRACT_SYSTEM_PROMPT,
        options=REDUCTO_OPTIONS,
        schema=BLOOD_TEST_SCHEMA,
    )