From 255b928192efe85879259755f97c8084563674bd Mon Sep 17 00:00:00 2001 From: nicepopo86-lang Date: Tue, 17 Feb 2026 06:08:41 +0000 Subject: [PATCH] Add incremental RFC notes and benchmark helper for join key prefiltering --- docs/dev/join-key-prefiltering-rfc-notes.md | 80 +++++++++++++++++++++ docs/dev/queries/join-key-prefiltering.ppl | 6 ++ scripts/benchmark_join_prefilter.sh | 48 +++++++++++++ 3 files changed, 134 insertions(+) create mode 100644 docs/dev/join-key-prefiltering-rfc-notes.md create mode 100644 docs/dev/queries/join-key-prefiltering.ppl create mode 100755 scripts/benchmark_join_prefilter.sh diff --git a/docs/dev/join-key-prefiltering-rfc-notes.md b/docs/dev/join-key-prefiltering-rfc-notes.md new file mode 100644 index 0000000000..e8aa2b4555 --- /dev/null +++ b/docs/dev/join-key-prefiltering-rfc-notes.md @@ -0,0 +1,80 @@ +# Join Key Pre-filtering: Incremental RFC Notes (Issue #5093) + +This document turns the RFC discussion into an incremental, reviewable implementation plan. + +## Problem Recap + +For sparse star-join workloads, the current join path can still scan most/all of the fact table even when the dimension-side filters are very selective. + +Example shape: + +```ppl +source=request_logs +| lookup dim_lookup host_key append _lookup, service_name, environment, region +| where _lookup = "host" +| where service_name = "payment-service" +| head 10 +| fields request_id, region; +``` + +Current behavior often results in: +1. filtered scan of `dim_lookup` +2. broad scan of `request_logs` +3. hash join + late `head` + +## Incremental Plan + +### Phase 1 (safe, narrow) + +Apply key pre-filtering only when all conditions below are met: + +- join type is effectively `inner` after predicates +- join condition is a single equality key (`fact.key = dim.key`) +- dimension side has pushdown-safe filters +- dimension key cardinality is below threshold (configurable) + +Execution sketch: + +1. evaluate/pushdown dimension-side predicates first +2. 
materialize matching join keys (bounded set) +3. inject fact-side `terms` filter on join key +4. continue existing join projection path + +### Phase 2 (broader coverage) + +- add support for selected left joins where null-preserving semantics are unchanged +- add adaptive thresholding based on planner stats / max terms size +- add early-limit strategies for top-k style pipelines where semantically safe + +## Correctness Guardrails + +- Never apply optimization if any semantic ambiguity exists. +- Preserve existing behavior for unsupported query shapes. +- Keep optimization behind a feature flag in initial rollout. + +## Suggested Config Knobs + +- `plugins.sql.optimization.joinKeyPrefilter.enabled` (default: false) +- `plugins.sql.optimization.joinKeyPrefilter.maxKeys` (default: 10_000) +- `plugins.sql.optimization.joinKeyPrefilter.maxBytes` (default: bounded) + +## Validation Matrix + +1. **Correctness tests** + - result equivalence against baseline for supported shapes + - no change for unsupported shapes +2. **Performance tests** + - sparse star-join fixture (expect substantial speedup) + - dense/non-selective joins (expect neutral/slight overhead) +3. **Safety tests** + - large keyset guard (optimizer must bail out) + +## Benchmark Reproduction Aid + +Use the helper script: + +```bash +scripts/benchmark_join_prefilter.sh +``` + +By default the script calls `POST /_plugins/_ppl` repeatedly and reports timing. See the usage notes at the top of the script for custom endpoint/query and hyperfine integration. 
diff --git a/docs/dev/queries/join-key-prefiltering.ppl b/docs/dev/queries/join-key-prefiltering.ppl
new file mode 100644
index 0000000000..09a6067121
--- /dev/null
+++ b/docs/dev/queries/join-key-prefiltering.ppl
@@ -0,0 +1,6 @@
+source=request_logs
+| lookup dim_lookup host_key append _lookup, service_name, environment, region
+| where _lookup = "host"
+| where service_name = "payment-service"
+| head 10
+| fields request_id, host_key, region;
diff --git a/scripts/benchmark_join_prefilter.sh b/scripts/benchmark_join_prefilter.sh
new file mode 100755
index 0000000000..299d79d59c
--- /dev/null
+++ b/scripts/benchmark_join_prefilter.sh
@@ -0,0 +1,112 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Benchmark helper for RFC #5093 (join key pre-filtering).
+#
+# Reads a query from QUERY_FILE, POSTs it to the PPL endpoint WARMUP + RUNS
+# times and reports timing (via hyperfine when available, else a curl loop).
+#
+# Usage:
+#   scripts/benchmark_join_prefilter.sh [QUERY_FILE]
+#
+# QUERY_FILE holds either raw PPL text (wrapped into {"query": "..."} before
+# sending) or a ready-made JSON request body starting with "{" (sent as-is).
+#
+# Env vars:
+#   OS_ENDPOINT     (default: http://localhost:9200)
+#   RUNS            (default: 10; positive integer)
+#   WARMUP          (default: 2; non-negative integer)
+#   USE_HYPERFINE   (default: 1; set 0 to use curl loop)
+
+QUERY_FILE="${1:-docs/dev/queries/join-key-prefiltering.ppl}"
+OS_ENDPOINT="${OS_ENDPOINT:-http://localhost:9200}"
+RUNS="${RUNS:-10}"
+WARMUP="${WARMUP:-2}"
+USE_HYPERFINE="${USE_HYPERFINE:-1}"
+
+# Print a message to stderr and abort.
+die() {
+  printf '%s\n' "$*" >&2
+  exit 1
+}
+
+# Escape $1 for use inside a JSON string literal (backslash, double quote,
+# newline, carriage return, tab). Other control characters are not handled.
+json_escape() {
+  local s=$1
+  s=${s//\\/\\\\}
+  s=${s//\"/\\\"}
+  s=${s//$'\n'/\\n}
+  s=${s//$'\r'/\\r}
+  s=${s//$'\t'/\\t}
+  printf '%s' "$s"
+}
+
+# Print the current time in milliseconds. Uses EPOCHREALTIME (bash >= 5) for
+# sub-second resolution; older shells fall back to whole seconds from date.
+now_ms() {
+  local micros
+  if [[ -n "${EPOCHREALTIME:-}" ]]; then
+    micros=${EPOCHREALTIME/[.,]/} # drop the (locale-dependent) decimal point
+    printf '%s\n' "$((micros / 1000))"
+  else
+    printf '%s\n' "$(($(date +%s) * 1000))"
+  fi
+}
+
+# POST the prepared request body once. --fail turns HTTP 4xx/5xx into a
+# non-zero exit so error responses are never timed as successful runs.
+post_query() {
+  curl -sS --fail -XPOST "$URL" --data-binary @"$payload_file" \
+    -H 'content-type: application/json' >/dev/null
+}
+
+[[ -f "$QUERY_FILE" ]] || die "Query file not found: $QUERY_FILE"
+[[ "$RUNS" =~ ^[1-9][0-9]*$ ]] || die "RUNS must be a positive integer: $RUNS"
+[[ "$WARMUP" =~ ^(0|[1-9][0-9]*)$ ]] ||
+  die "WARMUP must be a non-negative integer: $WARMUP"
+
+URL="${OS_ENDPOINT%/}/_plugins/_ppl"
+
+query=$(<"$QUERY_FILE")
+query=${query#"${query%%[![:space:]]*}"} # strip leading whitespace
+[[ -n "$query" ]] || die "Query file is empty: $QUERY_FILE"
+
+payload_file=$(mktemp)
+trap 'rm -f -- "$payload_file"' EXIT
+
+# The PPL endpoint takes a JSON body; wrap raw PPL text, pass JSON through.
+if [[ "$query" == "{"* ]]; then
+  printf '%s' "$query" >"$payload_file"
+else
+  printf '{"query": "%s"}' "$(json_escape "$query")" >"$payload_file"
+fi
+
+if [[ "$USE_HYPERFINE" == "1" ]] && command -v hyperfine >/dev/null 2>&1; then
+  # hyperfine runs the command through a shell, so quote the URL and path.
+  printf -v bench_cmd \
+    "curl -sS --fail -XPOST %q --data-binary @%q -H 'content-type: application/json' >/dev/null" \
+    "$URL" "$payload_file"
+  hyperfine --warmup "$WARMUP" --runs "$RUNS" "$bench_cmd"
+  exit 0
+fi
+
+echo "hyperfine not available (or disabled). Falling back to curl loop..."
+
+# C-style loops: `seq 1 0` counts down on BSD/macOS, so WARMUP=0 would
+# still issue requests there.
+for ((i = 0; i < WARMUP; i++)); do
+  post_query
+done
+
+start_ms=$(now_ms)
+for ((i = 0; i < RUNS; i++)); do
+  post_query
+done
+end_ms=$(now_ms)
+
+elapsed_ms=$((end_ms - start_ms))
+avg_ms=$((elapsed_ms / RUNS))
+printf 'runs=%d elapsed=%d.%03ds avg=%d.%03ds\n' \
+  "$RUNS" "$((elapsed_ms / 1000))" "$((elapsed_ms % 1000))" \
+  "$((avg_ms / 1000))" "$((avg_ms % 1000))"