Skip to main content
ship before you scale

Document Management and File Validation

4 min read Chapter 29 of 42

Document Management and File Validation

The Feature

Vendors upload documents during the application process: insurance certificates, health permits, business licenses, and product photos. Each document is tracked in the database with metadata (filename, size, upload date, document type). Organizers can view and download all documents for a vendor’s application.

The Decision

Files are stored in R2 with a structured key format: vendors/{vendor_id}/{market_id}/{uuid}_{filename}. Metadata is stored in PostgreSQL. This separation means the database stays small (metadata only) while R2 handles the actual bytes. Listing files is a database query, not an R2 API call, which is faster and cheaper.

The Implementation

Document Model

# backend/app/models/document.py
import uuid
from datetime import datetime
from enum import Enum
from typing import Optional

from sqlalchemy import ForeignKey, String
from sqlalchemy.orm import Mapped, mapped_column, relationship

from app.database import Base


class DocumentType(str, Enum):
    """Category of a document a vendor uploads with an application.

    The ``str`` mixin makes each member compare equal to its string value,
    e.g. ``DocumentType.INSURANCE == "insurance"``.
    """

    # Insurance certificate.
    INSURANCE = "insurance"
    # Health permit.
    HEALTH_PERMIT = "health_permit"
    # Business license.
    BUSINESS_LICENSE = "business_license"
    # Product photo.
    PRODUCT_PHOTO = "product_photo"
    # Presumably the catch-all for anything not covered above.
    OTHER = "other"


class Document(Base):
    """Metadata row describing one uploaded file.

    Only metadata is stored here: ``file_key`` identifies the object in
    storage, and the remaining columns record who the file belongs to, what
    it is called, how large it is, its MIME type, its category and when the
    row was created.
    """

    __tablename__ = "documents"

    id: Mapped[uuid.UUID] = mapped_column(primary_key=True, default=uuid.uuid4)
    vendor_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("vendors.id"))
    market_id: Mapped[uuid.UUID] = mapped_column(ForeignKey("markets.id"))
    # The column is nullable (rows can exist before they are linked to an
    # application), so the annotation must admit None as well.
    application_id: Mapped[Optional[uuid.UUID]] = mapped_column(
        ForeignKey("applications.id"), nullable=True
    )
    file_key: Mapped[str] = mapped_column(String(500))
    filename: Mapped[str] = mapped_column(String(255))
    file_size: Mapped[int]  # bytes
    content_type: Mapped[str] = mapped_column(String(100))
    document_type: Mapped[DocumentType]
    # NOTE(review): datetime.utcnow produces a naive timestamp and is
    # deprecated since Python 3.12; switching to an aware default also needs
    # a timezone-aware column, so it is left unchanged here.
    uploaded_at: Mapped[datetime] = mapped_column(default=datetime.utcnow)

File Validation

# backend/app/services/file_validation.py

ALLOWED_CONTENT_TYPES = {
    "application/pdf",
    "image/jpeg",
    "image/png",
    "image/webp",
}

MAX_FILE_SIZE = 10 * 1024 * 1024  # 10 MB

ALLOWED_EXTENSIONS = {".pdf", ".jpg", ".jpeg", ".png", ".webp"}

# Content type each accepted extension has to be declared with. Used to reject
# uploads whose extension and declared type disagree (e.g. "x.pdf" sent as
# "image/png"), which would otherwise pass both allow-list checks.
_CONTENT_TYPE_BY_EXTENSION = {
    ".pdf": "application/pdf",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".webp": "image/webp",
}


def validate_file(filename: str, file_size: int, content_type: str) -> list[str]:
    """Validate a file before generating an upload URL.

    Args:
        filename: Client-supplied file name; only its last extension is checked.
        file_size: Declared size in bytes.
        content_type: Declared MIME type.

    Returns:
        A list of error messages, ordered extension, size, content type.
        An empty list means the file is valid.
    """
    errors: list[str] = []

    # Extension: text after the last dot, lower-cased; "" when there is no dot.
    _, dot, suffix = filename.rpartition(".")
    ext = f".{suffix.lower()}" if dot else ""
    if ext not in ALLOWED_EXTENSIONS:
        errors.append(
            f"File type '{ext}' is not allowed. "
            f"Accepted types: {', '.join(sorted(ALLOWED_EXTENSIONS))}"
        )

    # Size: a zero or negative declared size can never describe a real upload.
    if file_size <= 0:
        errors.append("File size must be greater than zero.")
    elif file_size > MAX_FILE_SIZE:
        max_mb = MAX_FILE_SIZE // (1024 * 1024)
        errors.append(f"File size exceeds the {max_mb} MB limit.")

    # Content type: must be allowed and agree with the extension. The
    # mismatch check is skipped when the extension was already rejected, so
    # one problem does not produce two messages.
    expected_type = _CONTENT_TYPE_BY_EXTENSION.get(ext)
    if content_type not in ALLOWED_CONTENT_TYPES:
        errors.append(f"Content type '{content_type}' is not allowed.")
    elif expected_type is not None and content_type != expected_type:
        errors.append(
            f"Content type '{content_type}' does not match file extension '{ext}'."
        )

    return errors

Upload with Validation

# backend/app/routers/files.py (updated)
from fastapi import APIRouter, Depends, HTTPException
from pydantic import BaseModel

from app.services.file_validation import validate_file
from app.services.storage import StorageService

# Router for the file endpoints; routes registered on it are served under /api/files.
router = APIRouter(prefix="/api/files", tags=["files"])
# One StorageService instance, created at import time and used by the handlers in this module.
storage = StorageService()


class UploadRequest(BaseModel):
    """Request body for ``POST /api/files/upload-url``.

    ``filename``, ``file_size`` and ``content_type`` are the values the
    handler passes to ``validate_file``.
    """

    # NOTE(review): plain string here, while Document.market_id is a UUID
    # column; the model itself does not check the format.
    market_id: str
    filename: str
    file_size: int  # presumably bytes, matching Document.file_size
    content_type: str
    # NOTE(review): free-form string here, while Document.document_type is a
    # DocumentType enum; values outside the enum are not rejected by this model.
    document_type: str


@router.post("/upload-url")
async def get_upload_url(
    request: UploadRequest,
    vendor: Vendor = Depends(get_vendor_profile),
    db: AsyncSession = Depends(get_db),
):
    """Validate an upload request and return a pre-signed upload URL.

    A ``Document`` row is written before the client uploads anything, so the
    row describes a pending upload.

    Returns:
        A dict with ``upload_url``, ``file_key`` and ``document_id``.

    Raises:
        HTTPException: 400 with a list of messages when the file metadata,
            ``market_id`` or ``document_type`` is invalid.
    """
    # Function-scope imports so this handler does not rely on module-level
    # imports that are not visible alongside it.
    import uuid

    from app.models.document import DocumentType

    # Collect every problem first so the client sees all of them in one response.
    errors = validate_file(request.filename, request.file_size, request.content_type)

    # Document.market_id is a UUID column and the id becomes part of the
    # storage key; reject malformed ids here instead of failing at the database.
    try:
        market_id = uuid.UUID(request.market_id)
    except ValueError:
        market_id = None
        errors.append("market_id must be a valid UUID.")

    # Document.document_type is mapped as the DocumentType enum, so the raw
    # request string must be converted to a member before it is stored.
    try:
        document_type = DocumentType(request.document_type)
    except ValueError:
        document_type = None
        accepted = ", ".join(member.value for member in DocumentType)
        errors.append(
            f"Document type '{request.document_type}' is not allowed. "
            f"Accepted types: {accepted}"
        )

    if errors:
        raise HTTPException(status_code=400, detail=errors)

    # Generate upload URL (the canonical UUID string keeps storage keys uniform).
    # NOTE(review): the declared content_type is not forwarded here; confirm
    # that the storage service pins Content-Type on the pre-signed URL.
    result = storage.generate_upload_url(
        market_id=str(market_id),
        vendor_id=str(vendor.id),
        filename=request.filename,
    )

    # Create document record (pending upload)
    document = Document(
        vendor_id=vendor.id,
        market_id=market_id,
        file_key=result["file_key"],
        filename=request.filename,
        file_size=request.file_size,
        content_type=request.content_type,
        document_type=document_type,
    )
    db.add(document)
    # Read the id after flush but before commit: if the session expires
    # objects on commit, touching the attribute afterwards would need a
    # refresh, which an async session cannot do implicitly.
    await db.flush()
    document_id = str(document.id)
    await db.commit()

    return {
        "upload_url": result["upload_url"],
        "file_key": result["file_key"],
        "document_id": document_id,
    }

Document Listing for Organizers

@router.get("/applications/{application_id}/documents")
async def list_application_documents(
    application_id: str,
    market: Market = Depends(get_market_for_organizer),
    db: AsyncSession = Depends(get_db),
):
    """List all documents for a vendor application.
    Only accessible by the market organizer."""
    # Restrict to the requested application AND the organizer's market,
    # oldest upload first.
    query = (
        select(Document)
        .where(Document.application_id == application_id)
        .where(Document.market_id == market.id)
        .order_by(Document.uploaded_at)
    )
    rows = (await db.execute(query)).scalars().all()

    # One entry per document, each with a freshly generated download URL.
    listing = []
    for row in rows:
        entry = {
            "id": str(row.id),
            "filename": row.filename,
            "file_size": row.file_size,
            "document_type": row.document_type,
            "uploaded_at": row.uploaded_at.isoformat(),
            "download_url": storage.generate_download_url(row.file_key),
        }
        listing.append(entry)
    return listing

The Trap

# TRAP: Validating file type only by extension
filename = "malware.exe.pdf"  # Extension says PDF
# User renames any file to .pdf and uploads it

# SAFER: Validate both extension and content type
# NOTE(review): browsers typically derive Content-Type from the file
# extension rather than from the bytes, so treat it as a declared value
# and confirm before relying on it as proof of the real file type.
# Also enforce Content-Type in the pre-signed URL
url = self.s3.generate_presigned_url(
    "put_object",
    Params={
        "Bucket": self.bucket,
        "Key": key,
        "ContentType": content_type,  # signed into the URL: a PUT with a different Content-Type header is rejected
    },
    ExpiresIn=3600,
)

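The pre-signed URL includes the expected Content-Type as part of its signature, so Cloudflare R2 rejects the upload if the request's Content-Type header does not match. This stops a client from uploading under a different declared type than the one that was validated. Note that this check covers the declared header, not the file's bytes: browsers usually infer the MIME type from the file extension, so an executable renamed to .pdf is still reported as application/pdf. To catch renamed files, inspect the leading bytes (the file signature) of the stored object after the upload completes.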

The Cost

| Item | Free Tier Impact |
| --- | --- |
| R2 Class A operations (writes) | 1M free/month |
| R2 Class B operations (reads) | 10M free/month |
| PostgreSQL rows (metadata) | Negligible |

Each file upload uses 1 Class A operation (write) and 1 Class B operation (the pre-signed URL generation). Each document download uses 1 Class B operation. At 50 vendors uploading 5 documents each, that is 250 Class A operations per market season. Nowhere near the 1 million free tier limit.