Skip to main content
aws in the trenches advanced cloud engineering for senior developers

GSI Overloading and Advanced Query Patterns

7 min read Chapter 5 of 21

GSI Overloading and Advanced Query Patterns

A Global Secondary Index (GSI) in DynamoDB is essentially a separate table maintained automatically by DynamoDB — with its own partition and sort key, its own throughput capacity, and its own storage. What makes GSIs powerful for single-table design is overloading: storing different entity types in the same GSI columns, where the prefix of the value determines the entity type and access pattern.

GSI Overloading: Multiple Access Patterns, One Index

The base table has pk and sk. You add columns like gsi1pk, gsi1sk, gsi2pk, gsi2sk — and populate them differently per entity type:

# Single table with overloaded GSI
#
# Base Table:
# pk            | sk                    | gsi1pk         | gsi1sk              | gsi2pk        | gsi2sk
# --------------|----------------------|----------------|---------------------|---------------|------------------
# USER#123      | PROFILE              | EMAIL#[email protected]  | USER#123            | —             | —
# USER#123      | ORDER#2024-01#ord1   | ORDER#ord1     | METADATA            | STATUS#SHIPPED| 2024-01-15
# ORDER#ord1    | ITEM#sku-abc         | PRODUCT#sku-abc| ORDER#ord1          | —             | —
# PRODUCT#abc   | METADATA             | CATEGORY#elec  | PRODUCT#abc         | PRICE         | 00000029.99
# PRODUCT#abc   | REVIEW#user123       | —              | —                   | RATING#5      | 2024-01-10

# Access patterns served:
# Base table pk/sk → Get user profile, get user orders, get order items, get product
# GSI1 pk/sk      → Get user by email, get order metadata, get orders containing a product
# GSI2 pk/sk      → Get orders by status (sorted by date), products by price, reviews by rating

import boto3
from boto3.dynamodb.conditions import Key
from decimal import Decimal

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('app-table')

def create_product(sku: str, name: str, category: str, price: float):
    """Store product with GSI entries for category browsing and price sorting."""
    table.put_item(Item={
        'pk': f'PRODUCT#{sku}',
        'sk': 'METADATA',
        'entity_type': 'Product',
        'name': name,
        'category': category,
        'price': Decimal(str(price)),
        # GSI1: Browse by category
        'gsi1pk': f'CATEGORY#{category}',
        'gsi1sk': f'PRODUCT#{sku}',
        # GSI2: Sort by price within category (zero-padded for lexicographic sort)
        'gsi2pk': f'CATEGORY#{category}',
        'gsi2sk': f'PRICE#{price:010.2f}'
    })

def get_products_by_category_sorted_by_price(category: str, ascending=True, limit=20):
    """Query GSI2: products in a category sorted by price."""
    response = table.query(
        IndexName='gsi2-index',
        KeyConditionExpression=Key('gsi2pk').eq(f'CATEGORY#{category}')
                              & Key('gsi2sk').begins_with('PRICE#'),
        ScanIndexForward=ascending,
        Limit=limit
    )
    return response['Items']

def get_orders_by_status(status: str, start_date: str = None, limit=50):
    """Query GSI2: orders filtered by status, sorted by date."""
    key_condition = Key('gsi2pk').eq(f'STATUS#{status}')
    if start_date:
        key_condition = key_condition & Key('gsi2sk').gte(start_date)

    response = table.query(
        IndexName='gsi2-index',
        KeyConditionExpression=key_condition,
        ScanIndexForward=False,  # Most recent first
        Limit=limit
    )
    return response['Items']
import software.amazon.awssdk.services.dynamodb.DynamoDbClient;
import software.amazon.awssdk.services.dynamodb.model.*;
import java.util.*;

public class GsiOverloading {

    private final DynamoDbClient dynamo = DynamoDbClient.create();
    private static final String TABLE = "app-table";

    public void createProduct(String sku, String name, String category, double price) {
        String priceKey = String.format("PRICE#%010.2f", price);

        dynamo.putItem(PutItemRequest.builder()
            .tableName(TABLE)
            .item(Map.of(
                "pk", attr("PRODUCT#" + sku),
                "sk", attr("METADATA"),
                "entity_type", attr("Product"),
                "name", attr(name),
                "category", attr(category),
                "price", numAttr(String.valueOf(price)),
                "gsi1pk", attr("CATEGORY#" + category),
                "gsi1sk", attr("PRODUCT#" + sku),
                "gsi2pk", attr("CATEGORY#" + category),
                "gsi2sk", attr(priceKey)
            ))
            .build());
    }

    public List<Map<String, AttributeValue>> getProductsByCategoryAndPrice(
            String category, boolean ascending, int limit) {
        return dynamo.query(QueryRequest.builder()
            .tableName(TABLE)
            .indexName("gsi2-index")
            .keyConditionExpression("gsi2pk = :pk AND begins_with(gsi2sk, :prefix)")
            .expressionAttributeValues(Map.of(
                ":pk", attr("CATEGORY#" + category),
                ":prefix", attr("PRICE#")
            ))
            .scanIndexForward(ascending)
            .limit(limit)
            .build())
            .items();
    }

    private AttributeValue attr(String s) {
        return AttributeValue.builder().s(s).build();
    }
    private AttributeValue numAttr(String n) {
        return AttributeValue.builder().n(n).build();
    }
}

Sparse Indexes: The Hidden Filtering Mechanism

A GSI only contains items that have values for the GSI’s key attributes. If you leave gsi1pk empty (don’t include the attribute), the item doesn’t appear in the GSI. This creates a sparse index — a filtered view of your table.

# Use case: "Get all flagged orders that need review"
# Instead of scanning the entire table and filtering, use a sparse GSI

def flag_order_for_review(order_id: str, reason: str):
    """Adding the GSI key attribute causes the item to appear in the sparse index."""
    table.update_item(
        Key={'pk': f'ORDER#{order_id}', 'sk': 'METADATA'},
        UpdateExpression='SET review_gsi_pk = :pk, review_gsi_sk = :sk, flag_reason = :reason',
        ExpressionAttributeValues={
            ':pk': 'FLAGGED_ORDERS',
            ':sk': f'{order_id}',
            ':reason': reason
        }
    )

def unflag_order(order_id: str):
    """Removing the GSI key attribute removes the item from the sparse index."""
    table.update_item(
        Key={'pk': f'ORDER#{order_id}', 'sk': 'METADATA'},
        UpdateExpression='REMOVE review_gsi_pk, review_gsi_sk, flag_reason'
    )

def get_all_flagged_orders():
    """Query the sparse GSI — only flagged items exist here."""
    response = table.query(
        IndexName='review-gsi',
        KeyConditionExpression=Key('review_gsi_pk').eq('FLAGGED_ORDERS')
    )
    return response['Items']  # Only flagged orders, no scan needed

Projection Strategy and Cost Analysis

When you create a GSI, you choose what attributes to project (copy) into the index:

ProjectionStorage CostRead CostUse Case
KEYS_ONLYMinimal (keys + base table keys)Must fetch from base table for other attrsExistence checks, ID lookups
INCLUDEKeys + specified attributesOnly fetches base table for non-included attrsKnown query result shapes
ALLFull item duplicationNever needs base table fetchFlexible queries, unknown future needs

The cost trap: GSI writes consume WCU from the GSI’s allocation (separate from base table). If you project ALL, every base table write also writes the full item to every GSI. With 3 GSIs projecting ALL, your effective write cost is 4x.

# Cost calculation example:
# Table: 1,000 writes/sec, average item 2KB
# GSI1 (ALL projection): 1,000 writes/sec × 2KB = 2,000 WCU
# GSI2 (ALL projection): 1,000 writes/sec × 2KB = 2,000 WCU
# Base table: 1,000 writes/sec × 2KB = 2,000 WCU
# Total: 6,000 WCU (3x the base table alone!)

# Optimization: Use INCLUDE projection with only the attributes needed for query results
# GSI1 (INCLUDE: name, status): ~500 bytes projected = 500 WCU
# GSI2 (INCLUDE: date, amount): ~200 bytes projected = 200 WCU
# Total: 2,700 WCU (55% reduction from ALL projection)

Pagination Done Right

DynamoDB pagination uses ExclusiveStartKey — an opaque token that tells DynamoDB where to resume. Common mistakes: trying to implement “jump to page 5” (impossible without scanning pages 1-4), or leaking internal key structure to API clients.

import base64
import json

def paginate_orders(user_id: str, page_size: int = 20, cursor: str = None):
    """
    Cursor-based pagination suitable for API responses.
    The cursor is an opaque base64 token hiding the DynamoDB key structure.
    """
    query_params = {
        'KeyConditionExpression': Key('pk').eq(f'USER#{user_id}') & Key('sk').begins_with('ORDER#'),
        'ScanIndexForward': False,
        'Limit': page_size
    }

    if cursor:
        # Decode the cursor back to DynamoDB's ExclusiveStartKey
        exclusive_start_key = json.loads(base64.b64decode(cursor))
        query_params['ExclusiveStartKey'] = exclusive_start_key

    response = table.query(**query_params)

    # Build next cursor from LastEvaluatedKey
    next_cursor = None
    if 'LastEvaluatedKey' in response:
        next_cursor = base64.b64encode(
            json.dumps(response['LastEvaluatedKey']).encode()
        ).decode()

    return {
        'items': response['Items'],
        'next_cursor': next_cursor,
        'has_more': next_cursor is not None
    }
import software.amazon.awssdk.services.dynamodb.model.*;
import java.util.*;
import java.util.Base64;
import com.fasterxml.jackson.databind.ObjectMapper;

public class DynamoPagination {

    private final DynamoDbClient dynamo = DynamoDbClient.create();
    private final ObjectMapper mapper = new ObjectMapper();

    public record Page(
        List<Map<String, AttributeValue>> items,
        String nextCursor,
        boolean hasMore
    ) {}

    public Page getOrders(String userId, int pageSize, String cursor) throws Exception {
        QueryRequest.Builder requestBuilder = QueryRequest.builder()
            .tableName("app-table")
            .keyConditionExpression("pk = :pk AND begins_with(sk, :prefix)")
            .expressionAttributeValues(Map.of(
                ":pk", AttributeValue.builder().s("USER#" + userId).build(),
                ":prefix", AttributeValue.builder().s("ORDER#").build()
            ))
            .scanIndexForward(false)
            .limit(pageSize);

        if (cursor != null && !cursor.isEmpty()) {
            // Decode cursor to ExclusiveStartKey
            String decoded = new String(Base64.getDecoder().decode(cursor));
            Map<String, String> keyMap = mapper.readValue(decoded, Map.class);
            Map<String, AttributeValue> startKey = new HashMap<>();
            keyMap.forEach((k, v) -> startKey.put(k,
                AttributeValue.builder().s(v).build()));
            requestBuilder.exclusiveStartKey(startKey);
        }

        QueryResponse response = dynamo.query(requestBuilder.build());

        String nextCursor = null;
        if (response.lastEvaluatedKey() != null && !response.lastEvaluatedKey().isEmpty()) {
            Map<String, String> keyMap = new HashMap<>();
            response.lastEvaluatedKey().forEach((k, v) -> keyMap.put(k, v.s()));
            nextCursor = Base64.getEncoder().encodeToString(
                mapper.writeValueAsBytes(keyMap));
        }

        return new Page(response.items(), nextCursor, nextCursor != null);
    }
}

Warning: Never expose raw LastEvaluatedKey in your API. It contains your internal key structure (pk, sk values) which reveals your data model to clients and could be manipulated to access other users’ data if your key structure encodes user IDs.