import os
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import json
import httpx
from reducto import Reducto
from reducto.types import ParseResponse
from reducto.types.shared.parse_response import ResultFullResult, ResultFullResultChunk, ResultFullResultChunkBlock
from reducto.lib.helpers import handle_url_response, FullParseResponse
# Shared Reducto client; the API key is read from the REDUCTO_API_KEY
# environment variable (None is passed through when it is unset).
client = Reducto(api_key=os.getenv("REDUCTO_API_KEY"))
def parse_document(file_path: Path) -> FullParseResponse:
    """
    Upload a document to Reducto, parse it, and return the resolved response.

    Args:
        file_path: Path to the document file to upload.

    Returns:
        The value produced by ``handle_url_response`` for the parse response
        (expected to expose the parsed chunks and their blocks).
    """
    # Chunking is disabled so that block structure is preserved as-is
    retrieval_options = {"chunking": {"chunk_mode": "disabled"}}

    # Upload first, then parse the uploaded handle
    uploaded = client.upload(file=file_path)
    parsed = client.parse.run(input=uploaded, retrieval=retrieval_options)

    # Results that are too large may come back as a URL; the helper deals with that
    return handle_url_response(parsed)
def create_citation_content(response: FullParseResponse, file_path: Path) -> Tuple[List[Dict], Dict[int, Dict]]:
    """
    Turn a parse response into citation-numbered text plus a block lookup.

    Every block whose type is not "Page Number" receives the next citation
    number (starting at 1 and running across all chunks). The block's content
    is followed by "[n]" in the chunk's citation text, and its metadata is
    recorded under n.

    Args:
        response: The Reducto parse response; ``response.result.chunks`` is read
        file_path: Path to the source document, stored as a string in each
            document's metadata

    Returns:
        Tuple containing:
        - List with one document dict per chunk (id, embedding text,
          citation-numbered content, metadata) for vector storage
        - Dictionary mapping each citation number to its block's type,
          content, bounding box and owning chunk index
    """
    documents: List[Dict] = []
    block_lookup: Dict[int, Dict] = {}
    next_number = 1

    for position, chunk in enumerate(response.result.chunks):
        # chunk.embed is the text meant for embedding generation
        text_for_embedding = chunk.embed

        numbered_parts = []
        for block in chunk.blocks:
            # Page numbers are neither cited nor included in the LLM context
            if block.type == "Page Number":
                continue

            numbered_parts.append(f"{block.content} [{next_number}]")

            box = block.bbox
            block_lookup[next_number] = {
                "type": block.type,
                "content": block.content,
                "bbox": {
                    "left": box.left,
                    "top": box.top,
                    "width": box.width,
                    "height": box.height,
                    "page": box.page,
                },
                "chunk_idx": position,
            }
            next_number += 1

        # One vector-store document per chunk, even when nothing was cited
        documents.append({
            "id": f"chunk_{position}",
            "embedding_text": text_for_embedding,  # For embedding generation
            "citation_content": " ".join(numbered_parts).strip(),  # For LLM context
            "metadata": {
                "chunk_idx": position,
                "source": str(file_path),
            },
        })

    return documents, block_lookup
def store_in_vector_db(vector_docs: List[Dict], citation_map: Dict[int, Dict]):
    """
    Store documents in vector database with citation metadata.

    This is a placeholder function - implement with your preferred vector DB.
    As written it only prints one example document and one example citation
    entry as JSON. Empty inputs are reported with a short notice instead of
    raising.

    Args:
        vector_docs: List of documents with citation-numbered content
        citation_map: Dictionary mapping citation numbers to block metadata
    """
    # Example using a hypothetical vector DB client
    # vector_db = VectorDBClient()
    # Store citation map in a separate collection or as metadata
    # vector_db.store_metadata("citation_map", citation_map)
    # Store each document with its embedding
    # for doc in vector_docs:
    #     vector_db.add_document(
    #         id=doc["id"],
    #         text=doc["embedding_text"],  # For embedding generation
    #         metadata={
    #             **doc["metadata"],
    #             "citation_content": doc["citation_content"]  # For LLM context
    #         }
    #     )

    # For demonstration, we'll just print the first document and citation
    print("Example Vector Document:")
    if vector_docs:
        print(json.dumps(vector_docs[0], indent=2))
    else:
        # A response without chunks yields no documents; nothing to show
        print("(no vector documents)")

    print("\nExample Citation Mapping:")
    if citation_map:
        # Prefer citation 1 (numbering starts there); otherwise fall back to
        # whichever entry comes first so a sparse map does not raise KeyError
        example_number = 1 if 1 in citation_map else next(iter(citation_map))
        print(json.dumps({example_number: citation_map[example_number]}, indent=2))
    else:
        # E.g. a document whose only blocks were page numbers
        print("(no citations)")
def rag_with_citations(query: str, citation_map: Dict[int, Dict]):
    """
    Example RAG flow that resolves "[n]" markers in an answer to source blocks.

    Retrieval and generation are placeholders here: a fixed sample answer is
    used, so ``query`` is not consulted yet.

    Args:
        query: User query
        citation_map: Dictionary mapping citation numbers to block metadata

    Returns:
        Dict with the answer text under "response" and, under "citations", the
        metadata of every cited block found in ``citation_map`` (in order of
        appearance; unknown numbers are skipped).
    """
    import re

    # Outline of the real pipeline (placeholders):
    #   1. Retrieve relevant documents from the vector DB
    #        relevant_docs = vector_db.search(query, top_k=3)
    #   2. Build the LLM prompt from the citation-numbered content
    #        prompt = f"""
    #        Question: {query}
    #
    #        Context:
    #        {relevant_docs[0]['citation_content']}
    #        {relevant_docs[1]['citation_content']}
    #        {relevant_docs[2]['citation_content']}
    #
    #        Answer the question based on the context above. Include citation numbers [X] in your response.
    #        """
    #   3. Ask the LLM
    #        llm_response = llm_client.generate(prompt)

    # 4. Stand-in for the LLM output, containing two citation markers
    llm_response = "The company was founded in 2005 [3] and has offices in New York [7]."

    # 5. Pull out every bracketed number and keep the ones we have metadata for
    cited_numbers = [int(digits) for digits in re.findall(r'\[(\d+)\]', llm_response)]
    cited_blocks = [citation_map[number] for number in cited_numbers if number in citation_map]

    # 6. Answer text together with its resolved citations
    return {
        "response": llm_response,
        "citations": cited_blocks
    }
# Example usage
if __name__ == "__main__":
    # Document to run through the pipeline
    document = Path("example_document.pdf")

    # Parse it, then derive the numbered content and the citation lookup
    docs, citations = create_citation_content(parse_document(document), document)

    # Hand both to the (placeholder) vector store
    store_in_vector_db(docs, citations)

    # Run the sample query and print the answer with its mapped citations
    answer = rag_with_citations("When was the company founded?", citations)
    print("\nRAG Response with Citations:")
    print(json.dumps(answer, indent=2))