Changes from all commits (22 commits)
- `525a505` 🚰 feat: Stream Document Embeddings to Database in Batches (#214) (MarcAmick, Dec 10, 2025)
- `95db2e1` 📦 chore: Resolve Package Advisories (#220) (danny-avila, Dec 10, 2025)
- `1d3a505` 🔧 refactor: Document Processing and Health Check Functions (#221) (danny-avila, Dec 10, 2025)
- `1dc2d9a` ✨ chore: Enhance GitHub Actions Workflow with Disk Space Management (danny-avila, Dec 11, 2025)
- `fa0abea` 📋 docs: MongoDB file_id index recommendation (#219) (kazuya-awano, Dec 16, 2025)
- `1d6ef08` 📦 chore: Bump `langchain-core` to v0.3.81 (#225) (loganaden, Dec 29, 2025)
- `d89da49` 🪤 fix: Patch Path Traversal Vulnerabilities in File Embedding Endpoin… (Marshall-Hallenbeck, Feb 27, 2026)
- `d776d22` 📦 chore: update dependabot packages (#255) (danny-avila, Feb 27, 2026)
- `d8f640d` 📜 feat: Add setup script for venv creation (danny-avila, Feb 27, 2026)
- `2fa9600` 🩺 fix: Concurrent Upload Isolation, MongoDB Connection Leaks, and Val… (danny-avila, Mar 1, 2026)
- `719ac3d` 🦥 feat: Add lazy_load() support to document loaders (#257) (danny-avila, Mar 1, 2026)
- `57dbd2c` 🚰 fix: Plug MongoDB client, SQLAlchemy engine, and CSV temp file leak… (danny-avila, Mar 1, 2026)
- `8c1ed83` 🐘 fix: Migrate PGVector cmetadata Column to JSONB with GIN Index (#259) (danny-avila, Mar 1, 2026)
- `11c35e4` ⛓️‍💥 chore: Upgrade LangChain to 1.x and Consolidate Google AI Provid… (danny-avila, Mar 1, 2026)
- `b3a785f` 📦 chore: Bump `pypdf` to v6.7.4 (#261) (danny-avila, Mar 1, 2026)
- `3daa23d` fix: add missing averaged_perceptron_tagger_eng NLTK package (#265) (MieszkoMakuch, Mar 16, 2026)
- `2f6e6e9` 📦 chore: Update `pypdf` to v6.9.1 and `PyJWT` to v2.12.1 (#267) (danny-avila, Mar 20, 2026)
- `e30e4e3` 📜 fix: add missing `msoffcrypto-tool` package for xlsx file support (… (ABHIJITH-EA, Mar 20, 2026)
- `9938ee6` 🍃 fix: Resolve MongoDB Atlas Document Upload Failures and Cross-Batch… (mfish911, Mar 20, 2026)
- `85ab155` Merge main into upstream/rag_api_merge_v0.7.3 (paychex-joser, Apr 15, 2026)
- `ba60339` Update PaychexDockerfile with upstream changes (paychex-joser, Apr 15, 2026)
- `9da7074` Adding docker-compose.override to .gitignore (paychex-joser, Apr 16, 2026)
14 changes: 14 additions & 0 deletions .github/workflows/images.yaml
@@ -25,6 +25,18 @@ jobs:
          image_name: librechat-rag-api-dev-lite

    steps:
+      # Free up disk space
+      - name: Free Disk Space
+        uses: jlumbroso/free-disk-space@main
+        with:
+          tool-cache: true
+          android: true
+          dotnet: true
+          haskell: true
+          large-packages: true
+          docker-images: true
+          swap-storage: true
+
      # Check out the repository
      - name: Checkout
        uses: actions/checkout@v4
@@ -57,3 +69,5 @@ jobs:
            ghcr.io/${{ github.repository_owner }}/${{ matrix.image_name }}:latest
          platforms: linux/amd64,linux/arm64
          target: ${{ matrix.target }}
+          cache-from: type=gha
+          cache-to: type=gha,mode=max
4 changes: 4 additions & 0 deletions .gitignore
@@ -9,3 +9,7 @@ venv/
*.pyc
dev.yml
SHOPIFY.md
+
+# docker override file
+docker-compose.override.yaml
+docker-compose.override.yml
2 changes: 1 addition & 1 deletion Dockerfile
@@ -15,7 +15,7 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download standard NLTK data, to prevent unstructured from downloading packages at runtime
-RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger
+RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng
ENV NLTK_DATA=/app/nltk_data

# Disable Unstructured analytics
2 changes: 1 addition & 1 deletion Dockerfile.lite
@@ -15,7 +15,7 @@ COPY requirements.lite.txt .
RUN pip install --no-cache-dir -r requirements.lite.txt

# Download standard NLTK data, to prevent unstructured from downloading packages at runtime
-RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger
+RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng
ENV NLTK_DATA=/app/nltk_data

# Disable Unstructured analytics
4 changes: 1 addition & 3 deletions PaychexDockerfile
@@ -2,8 +2,6 @@ FROM python:3.12-slim AS main

WORKDIR /app

-WORKDIR /app
-
# Install pandoc and netcat
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
@@ -17,7 +15,7 @@ COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download standard NLTK data, to prevent unstructured from downloading packages at runtime
-RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger
+RUN python -m nltk.downloader -d /app/nltk_data punkt_tab averaged_perceptron_tagger averaged_perceptron_tagger_eng
ENV NLTK_DATA=/app/nltk_data

# Disable Unstructured analytics
155 changes: 153 additions & 2 deletions README.md
@@ -34,6 +34,33 @@ pip install -r requirements.txt
uvicorn main:app
```

### Clean Install (Local Development)

To do a clean reinstall of all dependencies (e.g., after updating `requirements.txt`):

```bash
# Remove existing virtual environment and recreate it
rm -rf venv
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.txt
```

For the lite version (without sentence_transformers/huggingface):

```bash
rm -rf venv
python3 -m venv venv
source venv/bin/activate
pip install -r requirements.lite.txt
```

For Docker, rebuild without cache:

```bash
docker compose build --no-cache
```

### Environment Variables

The following environment variables are required to run the application:
@@ -59,6 +86,8 @@ The following environment variables are required to run the application:
- `COLLECTION_NAME`: (Optional) The name of the collection in the vector store. Default value is "testcollection".
- `CHUNK_SIZE`: (Optional) The size of the chunks for text processing. Default value is "1500".
- `CHUNK_OVERLAP`: (Optional) The overlap between chunks during text processing. Default value is "100".
- `EMBEDDING_BATCH_SIZE`: (Optional) Number of document chunks to process per batch. Set to `0` (default) to disable batching. Recommended value is `750` for `text-embedding-3-small`.
- `EMBEDDING_MAX_QUEUE_SIZE`: (Optional) Maximum number of batches to buffer in memory during async processing. Default value is "3".
- `RAG_UPLOAD_DIR`: (Optional) The directory where uploaded files are stored. Default value is "./uploads/".
- `PDF_EXTRACT_IMAGES`: (Optional) A boolean value indicating whether to extract images from PDF files. Default value is "False".
- `DEBUG_RAG_API`: (Optional) Set to "True" to show more verbose logging output in the server console and to enable PostgreSQL database routes.
@@ -71,7 +100,7 @@ The following environment variables are required to run the application:
- azure: "text-embedding-3-small" (will be used as your Azure Deployment)
- huggingface: "sentence-transformers/all-MiniLM-L6-v2"
- huggingfacetei: "http://huggingfacetei:3000". Hugging Face TEI uses model defined on TEI service launch.
-   - vertexai: "text-embedding-004"
+   - vertexai: "gemini-embedding-001"
- ollama: "nomic-embed-text"
- bedrock: "amazon.titan-embed-text-v1"
- google_genai: "gemini-embedding-001"
@@ -90,11 +119,48 @@ The following environment variables are required to run the application:
- `AWS_SECRET_ACCESS_KEY`: (Optional) needed for bedrock embeddings
- `GOOGLE_API_KEY`, `GOOGLE_KEY`, `RAG_GOOGLE_API_KEY`: (Optional) Google API key for Google GenAI embeddings. Priority order: RAG_GOOGLE_API_KEY > GOOGLE_KEY > GOOGLE_API_KEY
- `AWS_SESSION_TOKEN`: (Optional) may be needed for bedrock embeddings
- - `GOOGLE_APPLICATION_CREDENTIALS`: (Optional) needed for Google VertexAI embeddings. This should be a path to a service account credential file in JSON format, as accepted by [langchain](https://python.langchain.com/api_reference/google_vertexai/index.html)
+ - `GOOGLE_APPLICATION_CREDENTIALS`: (Optional) needed for Google VertexAI embeddings. This should be a path to a service account credential file in JSON format.
- `GOOGLE_CLOUD_PROJECT`: (Optional) Google Cloud project ID, needed for VertexAI embeddings.
- `GOOGLE_CLOUD_LOCATION`: (Optional) Google Cloud region for VertexAI embeddings. Defaults to `us-central1`.
- `RAG_CHECK_EMBEDDING_CTX_LENGTH`: (Optional) Defaults to "true". Disabling this sends raw input to the embedder; use this for custom embedding models.

Make sure to set these environment variables before running the application. You can set them in a `.env` file or as system environment variables.
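
As an illustration, a minimal `.env` fragment covering the chunking, batching, and storage variables above might look like this (the values shown are the documented defaults and the recommendation for `text-embedding-3-small`, not requirements):

```shell
# Chunking (documented defaults)
CHUNK_SIZE=1500
CHUNK_OVERLAP=100

# Batched embedding, recommended for text-embedding-3-small
EMBEDDING_BATCH_SIZE=750
EMBEDDING_MAX_QUEUE_SIZE=3

# Storage (documented defaults)
COLLECTION_NAME=testcollection
RAG_UPLOAD_DIR=./uploads/
```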

### Embedding Batch Processing

For large files, you can enable batched embedding processing to reduce memory consumption. This is particularly useful in memory-constrained environments like Kubernetes pods with memory limits.

#### Configuration

| Variable | Default | Description |
|----------|---------|-------------|
| `EMBEDDING_BATCH_SIZE` | `0` | Number of document chunks to process per batch. `0` disables batching (original behavior). |
| `EMBEDDING_MAX_QUEUE_SIZE` | `3` | Maximum number of batches to buffer in memory during async processing. |

#### Recommended Settings

For `text-embedding-3-small` model:
- `EMBEDDING_BATCH_SIZE=750` - Good balance of throughput and memory

For memory-constrained environments (< 2GB RAM):
- `EMBEDDING_BATCH_SIZE=100-250`

For high-throughput environments:
- `EMBEDDING_BATCH_SIZE=1000-2000`
- `EMBEDDING_MAX_QUEUE_SIZE=5`

#### Behavior

When `EMBEDDING_BATCH_SIZE > 0`:
- Documents are processed in batches of the specified size
- Each batch is embedded and inserted before the next batch starts
- On failure, successfully inserted documents are rolled back
- Memory usage is bounded by `EMBEDDING_BATCH_SIZE * EMBEDDING_MAX_QUEUE_SIZE`

When `EMBEDDING_BATCH_SIZE = 0` (default):
- All documents are processed at once (original behavior)
- Better for small files or memory-rich environments
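
The bounded-memory behavior described above can be sketched with a producer/consumer queue. This is an illustrative sketch only; `embed_in_batches`, `batch_iter`, and `fake_embed` are hypothetical names, not this project's API:

```python
import asyncio
from typing import Iterable, Iterator, List, Tuple

def fake_embed(doc: str) -> Tuple[str, List[float]]:
    # Stand-in for a real embedding call; returns (doc, vector).
    return (doc, [0.0])

def batch_iter(docs: Iterable[str], batch_size: int) -> Iterator[List[str]]:
    """Yield successive batches of at most batch_size documents."""
    batch: List[str] = []
    for doc in docs:
        batch.append(doc)
        if len(batch) == batch_size:
            yield batch
            batch = []
    if batch:
        yield batch

async def embed_in_batches(docs, batch_size=500, max_queue_size=3):
    """The queue holds at most max_queue_size batches, so peak memory is
    roughly bounded by batch_size * max_queue_size chunks."""
    if batch_size <= 0:
        # Batching disabled: embed everything at once (original behavior).
        return [fake_embed(d) for d in docs]

    queue: asyncio.Queue = asyncio.Queue(maxsize=max_queue_size)
    results = []

    async def producer():
        for batch in batch_iter(docs, batch_size):
            await queue.put(batch)  # blocks while the queue is full
        await queue.put(None)       # sentinel: no more batches

    async def consumer():
        while (batch := await queue.get()) is not None:
            # Embed and insert this batch before pulling the next one.
            results.extend(fake_embed(d) for d in batch)

    await asyncio.gather(producer(), consumer())
    return results
```

For example, `asyncio.run(embed_in_batches([f"chunk{i}" for i in range(10)], batch_size=4))` processes the ten chunks in batches of 4, 4, and 2, with the producer blocking whenever the queue is full.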

### Use Atlas MongoDB as Vector Database

Instead of using the default pgvector, we could use [Atlas MongoDB](https://www.mongodb.com/products/platform/atlas-vector-search) as the vector database. To do so, set the following environment variables
@@ -127,6 +193,16 @@ The `ATLAS_MONGO_DB_URI` could be the same or different from what is used by Lib

Follow one of the [four documented methods](https://www.mongodb.com/docs/atlas/atlas-vector-search/create-index/#procedure) to create the vector index.

#### Create a `file_id` Index (recommended)

We recommend creating a standard MongoDB index on `file_id` to keep lookups fast. After creating the collection, run the following once (via Atlas UI, Compass, or `mongosh`):

```javascript
db.getCollection("<COLLECTION_NAME>").createIndex({ file_id: 1 })
```

Replace `<COLLECTION_NAME>` with the same collection used by the RAG API. This ensures lookups remain fast even as the number of embedded documents grows.


### Proxy Configuration

@@ -169,6 +245,81 @@ Notes:

### Dev notes:

#### Running Tests

##### Prerequisites

Install test dependencies:

```bash
pip install -r test_requirements.txt
```

##### Running All Tests

```bash
# Run all tests
pytest

# Run with verbose output
pytest -v

# Run with coverage (if pytest-cov is installed)
pytest --cov=app
```

##### Running Specific Test Files

```bash
# Run batch processing unit tests
pytest tests/test_batch_processing.py -v

# Run batch processing integration tests (memory optimization tests)
pytest tests/test_batch_processing_integration.py -v

# Run main API tests
pytest tests/test_main.py -v
```

##### Running Tests by Category

```bash
# Run only integration tests (marked with @pytest.mark.integration)
pytest -m integration -v

# Skip integration tests
pytest -m "not integration" -v

# Run only async tests
pytest -k "async" -v
```

##### Test Categories

| Test File | Description |
|-----------|-------------|
| `test_batch_processing.py` | Unit tests for batch processing functions |
| `test_batch_processing_integration.py` | Memory optimization and integration tests |
| `test_main.py` | API endpoint tests |
| `test_config.py` | Configuration tests |
| `test_middleware.py` | Middleware tests |
| `test_models.py` | Model tests |

##### Memory Optimization Tests

The `test_batch_processing_integration.py` file includes tests that verify the memory optimization behavior:

- **`test_memory_bounded_by_batch_size`**: Verifies that the number of documents in memory at any time is bounded by `EMBEDDING_BATCH_SIZE`
- **`test_memory_tracking_with_tracemalloc`**: Uses Python's `tracemalloc` to monitor memory usage during batch processing
- **`test_sync_memory_bounded_by_batch_size`**: Same verification for the synchronous code path

Run memory tests specifically:

```bash
pytest tests/test_batch_processing_integration.py::TestMemoryOptimization -v
pytest tests/test_batch_processing_integration.py::TestSyncBatchedMemory -v
```

#### Installing pre-commit formatter

Run the following commands to install the pre-commit formatter, which uses the [black](https://github.com/psf/black) code formatter:
31 changes: 27 additions & 4 deletions app/config.py
@@ -71,6 +71,23 @@ def get_env_variable(
CHUNK_SIZE = int(get_env_variable("CHUNK_SIZE", "1500"))
CHUNK_OVERLAP = int(get_env_variable("CHUNK_OVERLAP", "100"))

# Batch processing configuration for memory-constrained environments.
# When EMBEDDING_BATCH_SIZE > 0, documents are processed in batches to reduce
# peak memory usage. This is useful for Kubernetes pods with memory limits.
#
# Trade-offs:
# - Smaller batch size = lower memory, more DB round trips
# - Larger batch size = higher memory, fewer DB round trips
# - 0 = disable batching, process all at once
#
# Default of 500 is conservative and works well for most embedding providers.
# Increase to 750 for higher throughput at the cost of higher peak memory.
EMBEDDING_BATCH_SIZE = int(get_env_variable("EMBEDDING_BATCH_SIZE", "500"))

# Maximum number of batches to buffer in memory during async processing.
# Higher values allow more parallelism but use more memory.
EMBEDDING_MAX_QUEUE_SIZE = int(get_env_variable("EMBEDDING_MAX_QUEUE_SIZE", "3"))

env_value = get_env_variable("PDF_EXTRACT_IMAGES", "False").lower()
PDF_EXTRACT_IMAGES = True if env_value == "true" else False

@@ -241,12 +258,18 @@ def init_embeddings(provider, model):

        return GoogleGenerativeAIEmbeddings(
            model=model,
-           google_api_key=RAG_GOOGLE_API_KEY,
+           google_api_key=RAG_GOOGLE_API_KEY or None,
        )
    elif provider == EmbeddingsProvider.GOOGLE_VERTEXAI:
-       from langchain_google_vertexai import VertexAIEmbeddings
+       from langchain_google_genai import GoogleGenerativeAIEmbeddings

-       return VertexAIEmbeddings(model=model)
+       return GoogleGenerativeAIEmbeddings(
+           model=model,
+           google_api_key=RAG_GOOGLE_API_KEY or None,
+           vertexai=True,
+           project=get_env_variable("GOOGLE_CLOUD_PROJECT", None),
+           location=get_env_variable("GOOGLE_CLOUD_LOCATION", "us-central1"),
+       )
    elif provider == EmbeddingsProvider.BEDROCK:
        from langchain_aws import BedrockEmbeddings
elif provider == EmbeddingsProvider.BEDROCK:
from langchain_aws import BedrockEmbeddings

@@ -290,7 +313,7 @@ def init_embeddings(provider, model):
        "EMBEDDINGS_MODEL", "http://huggingfacetei:3000"
    )
elif EMBEDDINGS_PROVIDER == EmbeddingsProvider.GOOGLE_VERTEXAI:
-   EMBEDDINGS_MODEL = get_env_variable("EMBEDDINGS_MODEL", "text-embedding-004")
+   EMBEDDINGS_MODEL = get_env_variable("EMBEDDINGS_MODEL", "gemini-embedding-001")
elif EMBEDDINGS_PROVIDER == EmbeddingsProvider.OLLAMA:
    EMBEDDINGS_MODEL = get_env_variable("EMBEDDINGS_MODEL", "nomic-embed-text")
elif EMBEDDINGS_PROVIDER == EmbeddingsProvider.GOOGLE_GENAI: