Document Management and File Validation
Document Management and File Validation
The Feature
Vendors upload documents during the application process: insurance certificates, health permits, business licenses, and product photos. Each document is tracked in the database with metadata (filename, size, upload date, document type). Organizers can view and download all documents for a vendor’s application.
The Decision
Files are stored in R2 with a structured key format: vendors/{vendor_id}/{market_id}/{uuid}_{filename}. Metadata is stored in PostgreSQL. This separation means the database stays small (metadata only) while R2 handles the actual bytes. Listing files is a database query, not an R2 API call, which is faster and cheaper.
The Implementation
Document Model
# backend/app/models/document.py
import uuid
from datetime import datetime
from enum import Enum
from typing import Optional

from sqlalchemy import ForeignKey, String
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.database import Base
class DocumentType(str, Enum):
    """Kinds of document a vendor can attach to an application.

    Members are also ``str`` instances, so each one compares equal to its
    lowercase string value.
    """

    # Compliance paperwork
    INSURANCE = "insurance"
    HEALTH_PERMIT = "health_permit"
    BUSINESS_LICENSE = "business_license"

    # Product imagery
    PRODUCT_PHOTO = "product_photo"

    # Anything that fits none of the categories above
    OTHER = "other"
class Document(Base):
    """Metadata row for one uploaded vendor file.

    Only metadata is stored here; the file bytes live in object storage under
    ``file_key``. Each row belongs to a vendor and a market and may be linked
    to an application.
    """

    __tablename__ = "documents"

    # Primary key, generated in Python when the row is inserted.
    id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4)

    # Ownership: which vendor uploaded the file and for which market.
    vendor_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("vendors.id"))
    market_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("markets.id"))

    # The column is nullable, so the attribute can be None; the annotation is
    # Optional to match (it previously claimed the value was always a UUID).
    application_id: Mapped[Optional[uuid.UUID]] = mapped_column(
        ForeignKey("applications.id"), nullable=True
    )

    # Object-storage key and the metadata captured when the upload was requested.
    file_key: Mapped[str] = mapped_column(String(500))
    filename: Mapped[str] = mapped_column(String(255))
    file_size: Mapped[int]  # bytes
    content_type: Mapped[str] = mapped_column(String(100))
    document_type: Mapped[DocumentType]

    # NOTE(review): datetime.utcnow returns a naive UTC timestamp and is
    # deprecated since Python 3.12. Moving to an aware datetime also requires a
    # timezone-aware column, so the default is intentionally left unchanged.
    uploaded_at: Mapped[datetime] = mapped_column(default=datetime.utcnow)
File Validation
# backend/app/services/file_validation.py
# MIME types a vendor document may be declared as.
ALLOWED_CONTENT_TYPES = {
    "application/pdf",
    "image/jpeg",
    "image/png",
    "image/webp",
}
MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB
ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png", ".webp"}

# Content type each accepted extension is expected to be declared with. Used
# to reject mismatched pairs such as "scan.pdf" declared as "image/png".
_EXTENSION_CONTENT_TYPES = {
    ".pdf": "application/pdf",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
}


def validate_file(filename: str, file_size: int, content_type: str) -> list[str]:
    """Validate client-declared file metadata before issuing an upload URL.

    Args:
        filename: Name of the file as supplied by the client.
        file_size: Declared size of the file in bytes.
        content_type: Declared MIME type of the file.

    Returns:
        A list of human-readable error messages. An empty list means the
        metadata is acceptable.

    All three values are supplied by the client, so this checks what is
    declared, not the actual bytes of the file.
    """
    errors: list[str] = []

    # Extension: the text after the last dot, lowercased; "" when there is no dot.
    _, dot, suffix = filename.rpartition(".")
    ext = f".{suffix.lower()}" if dot else ""
    extension_ok = ext in ALLOWED_EXTENSIONS
    if not extension_ok:
        errors.append(
            f"File type '{ext}' is not allowed. "
            f"Accepted types: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Size: an empty or negative size is never a valid upload.
    if file_size <= 0:
        errors.append("File size must be greater than zero.")
    elif file_size > MAX_FILE_SIZE:
        max_mb = MAX_FILE_SIZE // (1024 * 1024)
        errors.append(f"File size exceeds the {max_mb} MB limit.")

    # Content type
    content_type_ok = content_type in ALLOWED_CONTENT_TYPES
    if not content_type_ok:
        errors.append(f"Content type '{content_type}' is not allowed.")

    # Cross-check: only meaningful when both values passed on their own, so a
    # single bad value does not produce two overlapping messages.
    if extension_ok and content_type_ok:
        expected = _EXTENSION_CONTENT_TYPES.get(ext)
        if expected is not None and expected != content_type:
            errors.append(
                f"Content type '{content_type}' does not match "
                f"file extension '{ext}'."
            )

    return errors
Upload with Validation
# backend/app/routers/files.py (updated)
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel
from app.services.file_validation import validate_file
from app.services.storage import StorageService
# Every route declared on this router is served under /api/files and tagged "files".
router = APIRouter(
    prefix="/api/files",
    tags=["files"],
)

# Single module-level storage service instance used by the handlers below.
storage = StorageService()
class UploadRequest(BaseModel):
    """Request body for requesting an upload URL.

    Every field describes the file the client intends to upload; the values
    are client-declared and are validated by the endpoint before use.
    """

    # Market the document is being submitted for.
    market_id: str

    # Client-side file name, declared size in bytes, and declared MIME type.
    filename: str
    file_size: int
    content_type: str

    # Category of the document, sent as a plain string.
    document_type: str
@router.post("/upload-url")
async def get_upload_url(
    request: UploadRequest,
    vendor: Vendor = Depends(get_vendor_profile),
    db: AsyncSession = Depends(get_db),
):
    """Validate an upload request, issue an upload URL, and record the document.

    Args:
        request: Client-declared metadata for the file to upload.
        vendor: Vendor profile resolved by the ``get_vendor_profile`` dependency.
        db: Database session used to store the document record.

    Returns:
        A dict with ``upload_url``, ``file_key`` and ``document_id``.

    Raises:
        HTTPException: 400 with a list of messages when the file metadata,
            ``market_id`` or ``document_type`` is invalid.
    """
    # Imported locally so this handler brings its own new names into scope.
    import uuid

    from app.models.document import DocumentType

    # Collect every problem so the client gets all messages in one response.
    errors = validate_file(request.filename, request.file_size, request.content_type)

    # market_id ends up in the storage key and in a UUID column, so reject
    # malformed values here instead of letting them fail further down.
    try:
        market_id = uuid.UUID(request.market_id)
    except ValueError:
        errors.append(f"Market id '{request.market_id}' is not a valid UUID.")

    # The document_type column is typed as DocumentType; convert the raw string
    # up front so an unknown value is a 400 rather than a failure at commit.
    try:
        document_type = DocumentType(request.document_type)
    except ValueError:
        accepted = ", ".join(member.value for member in DocumentType)
        errors.append(
            f"Document type '{request.document_type}' is not allowed. "
            f"Accepted types: {accepted}"
        )

    if errors:
        raise HTTPException(status_code=400, detail=errors)

    # Generate upload URL (canonical UUID text keeps storage keys consistent)
    result = storage.generate_upload_url(
        market_id=str(market_id),
        vendor_id=str(vendor.id),
        filename=request.filename,
    )

    # Create document record (pending upload)
    document = Document(
        vendor_id=vendor.id,
        market_id=market_id,
        file_key=result["file_key"],
        filename=request.filename,
        file_size=request.file_size,
        content_type=request.content_type,
        document_type=document_type,
    )
    db.add(document)

    # Flush so the Python-side primary-key default is applied, and read the id
    # before commit so the response does not depend on whether the session
    # expires attributes on commit.
    await db.flush()
    document_id = str(document.id)
    await db.commit()

    return {
        "upload_url": result["upload_url"],
        "file_key": result["file_key"],
        "document_id": document_id,
    }
Document Listing for Organizers
@router.get("/applications/{application_id}/documents")
async def list_application_documents(
    application_id: str,
    market: Market = Depends(get_market_for_organizer),
    db: AsyncSession = Depends(get_db),
):
    """Return the documents attached to one application, oldest first.

    The market comes from the ``get_market_for_organizer`` dependency, and
    only documents belonging to that market are returned. Each entry carries
    a freshly generated download URL.
    """
    query = (
        select(Document)
        .where(
            Document.application_id == application_id,
            Document.market_id == market.id,
        )
        .order_by(Document.uploaded_at)
    )
    rows = (await db.execute(query)).scalars().all()

    payload = []
    for record in rows:
        entry = {
            "id": str(record.id),
            "filename": record.filename,
            "file_size": record.file_size,
            "document_type": record.document_type,
            "uploaded_at": record.uploaded_at.isoformat(),
            "download_url": storage.generate_download_url(record.file_key),
        }
        payload.append(entry)
    return payload
The Trap
# TRAP: Validating file type only by extension
filename = "malware.exe.pdf" # Extension says PDF
# A user can rename any file to .pdf and upload it
# SAFER: Validate both the extension and the declared content type
# NOTE: the Content-Type a browser sends is client-supplied metadata (typically
# derived from the file extension), so treat it as a hint, not as proof of the
# file's real contents
# Also sign the expected Content-Type into the pre-signed URL
# (excerpt: self, key and content_type come from an enclosing method not shown here)
url = self.s3.generate_presigned_url(
"put_object",
Params={
"Bucket": self.bucket,
"Key": key,
"ContentType": content_type, # signed value: the upload must send this exact Content-Type header
},
ExpiresIn=3600,
)
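The pre-signed URL includes the expected Content-Type, so the upload request must send that exact Content-Type header; if it does not, the signature no longer matches and Cloudflare R2 rejects the upload. This guarantees that the stored object carries the content type that was validated, but it does not inspect the file's bytes: a user who renames an executable to .pdf and uploads it with Content-Type: application/pdf will still be accepted. The MIME type reported by the browser is usually derived from the file extension, so treat it as a second declared value to cross-check rather than as proof of the real file type. To be certain of what was uploaded, inspect the file's leading bytes (its magic number) on the server after the upload completes.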
The Cost
| Item | Free Tier Impact |
|---|---|
| R2 Class A operations (writes) | 1M free/month |
| R2 Class B operations (reads) | 10M free/month |
| PostgreSQL rows (metadata) | Negligible |
Each file upload uses 1 Class A operation (the write). Generating the pre-signed URL is a local signing step that makes no request to R2, so it consumes no operations. Each document download uses 1 Class B operation (the read). At 50 vendors uploading 5 documents each, that is 250 Class A operations per market season, nowhere near the 1 million free tier limit.