Skip to content

ci(publish): fix Soda PyPI --skip-existing + single approval gate #2473

ci(publish): fix Soda PyPI --skip-existing + single approval gate

ci(publish): fix Soda PyPI --skip-existing + single approval gate #2473

Workflow file for this run

---
# CI workflow: lint/lockfile checks, the per-data-source test matrix, and
# dev-PyPI publishing. Triggered manually, on pushes to main, and on pull
# requests.
name: CI pipeline

on:
  # Manual run: the caller may narrow the test matrix to one data source and
  # opt in to the nightly-only tests.
  workflow_dispatch:
    inputs:
      dataSource:
        description: Run tests for this Soda data source
        type: choice
        default: all
        options:
          - all
          - athena
          - bigquery
          - databricks
          - duckdb
          - fabric
          - postgres
          - redshift
          - snowflake
          - sqlserver
          - synapse
          - sparkdf
          - trino-postgres
          - trino-s3
      includeNightlyTests:
        description: Include nightly-only tests (slow / rarely changing)
        type: boolean
        default: false

  push:
    branches:
      - main

  # NOTE: Tag-triggered production releases are handled by release.yaml.
  # This workflow only runs CI + dev-pypi publishes for main branch pushes.
  pull_request:

env:
  PROJECT_NAME: soda-core

jobs:
check:
  # Static gate: verifies that the uv lockfile is current and runs the
  # pre-commit hooks.
  name: 'pre-commit & lockfile'
  runs-on: ubuntu-24.04
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        # Quoted so the version is not read as the float 3.1.
        python-version: "3.10"

    - name: Set up UV
      uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081  # v5

    - name: Check lockfile is up-to-date
      run: uv lock --check

    - name: Run pre-commit
      uses: pre-commit/action@v3.0.1
detect-skip-tests:
  # Publishes `skip` = 'true' when the subject line of the triggering commit
  # starts with "[SKIP-TESTS]", and 'false' otherwise.
  name: detect [SKIP-TESTS]
  runs-on: ubuntu-24.04
  outputs:
    skip: ${{ steps.check.outputs.skip }}
  steps:
    - uses: actions/checkout@v4

    - name: Check commit message for [SKIP-TESTS] prefix
      id: check
      run: |
        # For pull requests the PR head commit is fetched explicitly and
        # inspected via FETCH_HEAD; every other event inspects HEAD.
        rev=HEAD
        if [ "${{ github.event_name }}" = "pull_request" ]; then
          git fetch origin "${{ github.event.pull_request.head.sha }}" --depth=1
          rev=FETCH_HEAD
        fi
        subject=$(git log -1 --pretty=%s "$rev")

        case "$subject" in
          "[SKIP-TESTS]"*)
            echo "skip=true" >> "$GITHUB_OUTPUT"
            echo "Commit message starts with [SKIP-TESTS] — tests will be skipped"
            ;;
          *)
            echo "skip=false" >> "$GITHUB_OUTPUT"
            ;;
        esac
define-test-matrix:
  # Produces the `modules` output (a JSON list) that drives the test matrix.
  # Does not run when the commit asked for [SKIP-TESTS].
  needs: [check, detect-skip-tests]
  if: needs.detect-skip-tests.outputs.skip != 'true'
  runs-on: ubuntu-24.04
  outputs:
    modules: ${{ steps.modules.outputs.modules }}
  steps:
    - uses: actions/checkout@v4

    - name: Set data source (workflow_dispatch)
      if: github.event_name == 'workflow_dispatch'
      run: |
        # An empty selection is treated the same as "all".
        selected="${{ inputs.dataSource }}"
        echo "DATA_SOURCE=${selected:-all}" >> "${GITHUB_ENV}"

    # Every non-manual trigger always tests all data sources.
    - name: Set data source (push)
      if: github.event_name != 'workflow_dispatch'
      run: echo "DATA_SOURCE=all" >> "${GITHUB_ENV}"

    - name: Define modules
      id: modules
      run: |
        echo "INFO: DATA_SOURCE is set to ${DATA_SOURCE}"
        case "${DATA_SOURCE}" in
          all)
            # NOTE(review): the substitution is intentionally left unquoted,
            # so the script output is word-split and re-joined with single
            # spaces onto one line — confirm whether the script can emit
            # multi-line JSON before quoting it.
            echo modules=$(bash scripts/test_matrix.sh) >> "$GITHUB_OUTPUT"
            ;;
          *)
            # Single-element JSON list holding the selected data source.
            echo 'modules=["__DATA_SOURCE__"]' | sed "s|__DATA_SOURCE__|${DATA_SOURCE}|g" >> "$GITHUB_OUTPUT"
            ;;
        esac
test:
  # Runs the test suites once per (python-version, module) combination.
  needs: [define-test-matrix]
  runs-on: ubuntu-24.04
  strategy:
    fail-fast: false
    matrix:
      # Pull requests test a single Python version; other events test all.
      python-version: ${{ github.event_name == 'pull_request' && fromJSON('["3.14"]') || fromJSON('["3.10", "3.11", "3.12", "3.13", "3.14"]') }}
      module: ${{ fromJSON(needs.define-test-matrix.outputs.modules) }}
  services:
    # Each image expression evaluates to '' for modules that do not use the
    # service — presumably so the container is not started; confirm.
    postgres:
      # please keep the postgres version in sync with the one in docker-compose.yml for postgres
      # Also used by trino-postgres (Trino's PostgreSQL catalog connector targets this instance)
      image: ${{ ( matrix.module == 'postgres' || startsWith(matrix.module, 'trino') ) && 'postgres:15.10-alpine3.21' || '' }}
      ports:
        - "5432:5432"
      env:
        POSTGRES_USER: soda_test
        POSTGRES_DB: soda_test
        POSTGRES_HOST_AUTH_METHOD: trust
      options: >-
        --health-cmd pg_isready
        --health-interval 10s
        --health-timeout 5s
        --health-retries 5
        --health-start-period 10s
    sqlserver:
      image: ${{ ( matrix.module == 'sqlserver' ) && 'mcr.microsoft.com/mssql/server:2022-latest' || '' }}
      ports:
        - "1433:1433"
      env:
        ACCEPT_EULA: "Y"
        SA_PASSWORD: "Password1!"
      options: >-
        --health-cmd "/opt/mssql-tools18/bin/sqlcmd -S localhost -U sa -P Password1! -Q 'select 1' -C -b -o /dev/null"
        --health-interval 1s
        --health-timeout 2s
        --health-retries 10
        --health-start-period 10s
  env:
    # Snowflake
    SNOWFLAKE_ACCOUNT: ${{ vars.SNOWFLAKE_CI_ACCOUNT }}
    SNOWFLAKE_USER: ${{ vars.SNOWFLAKE_CI_USERNAME }}
    SNOWFLAKE_DATABASE: ${{ vars.SNOWFLAKE_CI_DATABASE }}
    # Databricks
    DATABRICKS_HOST: ${{ vars.DATABRICKS_CI_HOST }}
    DATABRICKS_HTTP_PATH: ${{ vars.DATABRICKS_CI_HTTP_PATH }}
    DATABRICKS_CATALOG: ${{ vars.DATABRICKS_CI_CATALOG }}
    # Redshift
    REDSHIFT_HOST: ${{ vars.REDSHIFT_CI_HOST }}
    REDSHIFT_USERNAME: ${{ vars.REDSHIFT_CI_USERNAME }}
    REDSHIFT_DATABASE: "soda_test"
    REDSHIFT_PORT: "5439"
    # Athena
    ATHENA_S3_TEST_DIR: ${{ vars.ATHENA_CI_STAGING_DIR }}
    ATHENA_SCHEMA: ${{ vars.ATHENA_CI_SCHEMA }}
    ATHENA_WORKGROUP: ${{ vars.ATHENA_CI_WORKGROUP }}
    # SQL Server (credentials match the sqlserver service container above)
    SQLSERVER_USERNAME: sa
    SQLSERVER_PASSWORD: "Password1!"
    SQLSERVER_DATABASE: master
    SQLSERVER_SCHEMA: dbo
    # Synapse
    SYNAPSE_HOST: ${{ vars.MICROSOFT_SYNAPSE_CI_HOST }}
    SYNAPSE_DATABASE: sodacisynapse
    SYNAPSE_AUTHENTICATION_TYPE: activedirectoryserviceprincipal
    # Fabric
    FABRIC_HOST: ${{ vars.MICROSOFT_FABRIC_CI_HOST }}
    FABRIC_DATABASE: soda-ci-fabric-warehouse
    FABRIC_AUTHENTICATION_TYPE: activedirectoryserviceprincipal
    # Test-run switches
    SODA_CORE_TELEMETRY_LOCAL_TEST_MODE: "true"
    SODA_NIGHTLY: ${{ inputs.includeNightlyTests }}
  steps:
- uses: actions/checkout@v4

# Translates the matrix entry into the variables later steps read from the
# environment: TEST_MODULE, TRINO_CATALOG (trino variants only) and CACHE_KEY.
- name: Resolve module
  run: |
    matrix_module="${{ matrix.module }}"

    # Both trino variants share the "trino" test module and differ only in
    # the catalog; every other module maps to itself.
    test_module="$matrix_module"
    trino_catalog=""
    case "$matrix_module" in
      trino-postgres) test_module=trino; trino_catalog=db ;;
      trino-s3)       test_module=trino; trino_catalog=iceberg ;;
    esac

    {
      echo "TEST_MODULE=$test_module"
      if [ -n "$trino_catalog" ]; then
        echo "TRINO_CATALOG=$trino_catalog"
      fi
      echo "CACHE_KEY=$matrix_module"
    } >> "$GITHUB_ENV"
- name: Set up Python ${{ matrix.python-version }}
  uses: actions/setup-python@v5
  with:
    python-version: ${{ matrix.python-version }}

- name: Set up UV
  uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081  # v5

# Registers Microsoft's apt repository, then installs the SASL headers and
# the MS ODBC driver 18 package.
# NOTE(review): the repository list is the Ubuntu 21.04 one although the job
# runs on ubuntu-24.04, and `apt-key` is deprecated — confirm this is still
# the intended source before changing it.
- name: Install dependencies
  run: |
    curl https://packages.microsoft.com/keys/microsoft.asc | sudo apt-key add -
    curl https://packages.microsoft.com/config/ubuntu/21.04/prod.list | sudo tee /etc/apt/sources.list.d/mssql-release.list > /dev/null
    sudo apt-get update
    ACCEPT_EULA=Y sudo apt-get install -y libsasl2-dev msodbcsql18
# Fetches the shared data-source settings from AWS Secrets Manager and parses
# the JSON secret into individual values.
# NOTE(review): the leading comma is an empty alias — presumably so the parsed
# keys are exported without a name prefix; confirm against the action's docs.
- name: Get external secrets
  uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802  # v2.0.10
  with:
    parse-json-secrets: true
    secret-ids: |
      ,/soda/github/common/data-sources/envs
  env:
    AWS_REGION: ${{ secrets.AWS_BUILD_DEFAULT_REGION }}
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_BUILD_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_BUILD_SECRET_ACCESS_KEY }}

# The BigQuery account info is fetched separately under the alias
# BIGQUERY_ACCOUNT_INFO_JSON and, unlike the step above, without
# parse-json-secrets.
- name: Get external secrets (special)
  uses: aws-actions/aws-secretsmanager-get-secrets@a9a7eb4e2f2871d30dc5b892576fde60a2ecc802  # v2.0.10
  with:
    secret-ids: |
      BIGQUERY_ACCOUNT_INFO_JSON,/soda/github/common/data-sources/BIGQUERY_ACCOUNT_INFO_JSON
  env:
    AWS_REGION: ${{ secrets.AWS_BUILD_DEFAULT_REGION }}
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_BUILD_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_BUILD_SECRET_ACCESS_KEY }}
# Only for the trino-* matrix entries: runs the Trino CI start script with
# the AWS credentials, the role ARN and the matrix module in its environment.
- name: Configure Trino
  if: startsWith(matrix.module, 'trino')
  run: bash scripts/start_trino_ci.sh
  env:
    CI_MODULE: ${{ matrix.module }}
    TRINO_CI_ASSUME_ROLE_ARN: ${{ secrets.TRINO_CI_ASSUME_ROLE_ARN }}
    AWS_REGION: ${{ secrets.AWS_BUILD_DEFAULT_REGION }}
    AWS_ACCESS_KEY_ID: ${{ secrets.AWS_BUILD_ACCESS_KEY_ID }}
    AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_BUILD_SECRET_ACCESS_KEY }}
# ── Restore nightly snapshots for PR replay ─────────────────────
# Pull requests only. `key` and `restore-keys` are the same per-module prefix.
# NOTE(review): presumably the saved cache keys carry a suffix after the
# trailing '-', so the restore relies on prefix matching — confirm against
# the workflow that saves these snapshots.
- name: Restore snapshot cache
  id: snapshot-cache
  if: github.event_name == 'pull_request'
  uses: actions/cache/restore@v4
  with:
    path: .test_snapshots/
    key: snapshots-${{ env.CACHE_KEY }}-
    restore-keys: |
      snapshots-${{ env.CACHE_KEY }}-
# Sets FORCE_SNAPSHOT_OFF=true for later steps when the commit subject starts
# with "[REPLAY=OFF]"; otherwise leaves the environment untouched.
- name: Check for replay override
  run: |
    # For pull requests the PR head commit is fetched explicitly and
    # inspected via FETCH_HEAD; every other event inspects HEAD.
    rev=HEAD
    if [ "${{ github.event_name }}" = "pull_request" ]; then
      git fetch origin "${{ github.event.pull_request.head.sha }}" --depth=1
      rev=FETCH_HEAD
    fi
    subject=$(git log -1 --pretty=%s "$rev")

    case "$subject" in
      "[REPLAY=OFF]"*)
        echo "FORCE_SNAPSHOT_OFF=true" >> "$GITHUB_ENV"
        echo "Commit message starts with [REPLAY=OFF] — forcing snapshot mode off"
        ;;
    esac
# Chooses SODA_TEST_SNAPSHOT for the following steps. The mode is "replay"
# (with SODA_TEST_SNAPSHOT_FALLBACK=true) only for pull requests that have a
# .test_snapshots directory; every other case is "off".
- name: Determine snapshot mode
  run: |
    snapshot_mode=off

    if [ "$FORCE_SNAPSHOT_OFF" = "true" ]; then
      echo "Snapshot replay disabled via [REPLAY=OFF] in commit message"
    # duckdb and sparkdf run fast enough without snapshots — always use real DB
    elif [ "${{ env.TEST_MODULE }}" = "duckdb" ] || [ "${{ env.TEST_MODULE }}" = "sparkdf" ]; then
      echo "Running ${{ env.TEST_MODULE }} tests against real DB (no snapshots)"
    elif [ "${{ github.event_name }}" = "pull_request" ] && [ -d ".test_snapshots" ]; then
      snapshot_mode=replay
      echo "SODA_TEST_SNAPSHOT_FALLBACK=true" >> "$GITHUB_ENV"
      echo "Using snapshot replay with fallback for PR"
    else
      echo "Running tests against real databases"
    fi

    echo "SODA_TEST_SNAPSHOT=${snapshot_mode}" >> "$GITHUB_ENV"
# Installs the locked workspace, then runs, in order: the shared integration
# tests, the unit and feature tests (postgres only), the module's own tests,
# and a second integration pass against hive_metastore (databricks only).
- name: Run tests
  run: |
    uv sync --locked --all-packages --group dev

    # sparkdf gets a JDK 17; JAVA_HOME and PATH are exported for this script
    # and also written to GITHUB_ENV for later steps.
    if [ "${{ matrix.module }}" = "sparkdf" ]; then
      sudo apt-get update
      sudo apt-get install -y openjdk-17-jdk
      export JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64
      export PATH=$JAVA_HOME/bin:$PATH
      {
        echo "JAVA_HOME=$JAVA_HOME"
        echo "PATH=$PATH"
      } >> "$GITHUB_ENV"
    fi

    export TEST_DATASOURCE=${{ env.TEST_MODULE }}
    uv run python -m pytest -ra soda-tests/tests/integration

    if [ "${{ env.TEST_MODULE }}" = "postgres" ]; then
      for suite in unit feature; do
        uv run python -m pytest -ra "soda-tests/tests/${suite}"
      done
    fi

    uv run python -m pytest -ra soda-${{ env.TEST_MODULE }}/tests

    if [ "${{ env.TEST_MODULE }}" = "databricks" ]; then
      # hive_metastore always runs without snapshots for now
      export SODA_TEST_SNAPSHOT=off
      unset SODA_TEST_SNAPSHOT_FALLBACK
      export DATABRICKS_CATALOG=hive_metastore
      echo "Changed DATABRICKS_CATALOG environment variable to hive_metastore"
      uv run python -m pytest -ra soda-tests/tests/integration
    fi
# ── Run no_snapshot tests against real DB ───────────────────────
# The step-level env forces SODA_TEST_SNAPSHOT to "off" for this step only.
- name: Run no_snapshot tests
  if: env.SODA_TEST_SNAPSHOT == 'off' || env.SODA_TEST_SNAPSHOT == 'replay'
  env:
    TEST_DATASOURCE: ${{ env.TEST_MODULE }}
    SODA_TEST_SNAPSHOT: "off"
  run: |
    # Run tests marked no_snapshot against the real DB; exit 0 if none found.
    status=0
    uv run python -m pytest -ra soda-tests/tests/integration -m "no_snapshot" \
      --no-header -q || status=$?

    # Exit code 5 is treated as success (no tests found); any other code is
    # passed through unchanged.
    if [ "$status" -eq 5 ]; then
      exit 0
    fi
    exit "$status"
# Diagnostics and cleanup for the trino-* entries. Both steps use always()
# so they run even when an earlier step failed.
- name: Dump Trino logs
  if: always() && startsWith(matrix.module, 'trino')
  run: |
    echo "=== Trino container logs ==="
    # Only the last 100 lines of the container's combined output are shown.
    docker logs trino-ci 2>&1 | tail -100

- name: Stop Trino container
  if: always() && startsWith(matrix.module, 'trino')
  # Errors are suppressed so that removing the container never fails the job.
  run: docker rm -f trino-ci 2>/dev/null || true
define-matrix:
  # Builds the `modules` output (a JSON list) for the dev-PyPI release matrix.
  #
  # Gating: runs on main only, and only when the `check` job succeeded and
  # either the tests passed or they were skipped on purpose via [SKIP-TESTS].
  # A bare `needs.test.result == 'skipped'` is not sufficient: `test` is also
  # reported as skipped when `check` or `detect-skip-tests` fails, because
  # that skips `define-test-matrix` and everything downstream of it. The
  # explicit checks below keep such a failure from leading to a publish.
  if: |
    always() && !cancelled() && github.ref_name == 'main'
    && needs.check.result == 'success'
    && (
      needs.test.result == 'success'
      || (needs.test.result == 'skipped' && needs.detect-skip-tests.outputs.skip == 'true')
    )
  runs-on: ubuntu-24.04
  # `check` and `detect-skip-tests` are listed so that their result and
  # outputs are available in the `needs` context used above.
  needs: [check, detect-skip-tests, test]
  outputs:
    modules: ${{ steps.modules.outputs.modules }}
  steps:
    - uses: actions/checkout@v4

    - name: Define modules
      id: modules
      run: |
        # NOTE(review): the substitution is left unquoted, so the script
        # output is word-split and re-joined onto a single line — confirm
        # whether the script can emit multi-line JSON before quoting it.
        echo modules=$(bash scripts/release_matrix.sh) >> "$GITHUB_OUTPUT"
release-to-dev-pypi:
  # Builds each module listed by define-matrix and uploads it to the dev
  # package index.
  #
  # Mirror define-matrix's gating: without always() + an explicit
  # needs.<job>.result check, GitHub Actions' default success() evaluates
  # the *transitive* needs graph — so a [SKIP-TESTS] commit that skips
  # `test` also skips this job, defeating the SKIP-TESTS contract
  # (dev-pypi publish must still run on main).
  if: |
    always() && !cancelled() && github.ref_name == 'main'
    && needs.define-matrix.result == 'success'
  runs-on: ubuntu-24.04
  needs: [define-matrix]
  strategy:
    fail-fast: false
    matrix:
      module: ${{ fromJSON(needs.define-matrix.outputs.modules) }}
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python 3.10
      uses: actions/setup-python@v5
      with:
        python-version: "3.10"

    - name: Set up UV
      uses: astral-sh/setup-uv@e58605a9b6da7c637471fab8847a5e5a6b8df081  # v5

    - name: Debug GITHUB_REF
      run: echo "GITHUB_REF=$GITHUB_REF"

    - name: Release ${{ matrix.module }}
      run: |
        uv venv .venv
        source .venv/bin/activate
        uv pip install tbump build twine

        # A tag ref keeps the committed version as-is; any other ref is
        # bumped to <current version without pre/dev suffix>.dev<run number>.
        # NOTE(review): this job is gated on ref_name == 'main', so the tag
        # branch looks unreachable here — confirm before removing it.
        if [ "${GITHUB_REF#refs/tags/}" != "$GITHUB_REF" ]; then
          VERSION="${GITHUB_REF#refs/tags/}"
          echo "Using tag version: $VERSION"
        else
          CURRENT_VERSION_ROOT="$(tbump current-version | sed -E 's/((a|b|rc)[0-9]+|\.dev[0-9]+)$//')"
          echo "No tag found, bumping to dev version: $CURRENT_VERSION_ROOT"
          tbump --only-patch --non-interactive "${CURRENT_VERSION_ROOT}.dev${GITHUB_RUN_NUMBER}"
        fi

        cd "${{ matrix.module }}"
        python3 -m build

    # Stagger uploads so many packages don't hit PyPI at once (reduces 503s)
    - name: Stagger uploads
      run: |
        # Deterministic per-module delay of 0-90 seconds, derived from a
        # checksum of the module name.
        DELAY=$(echo -n "${{ matrix.module }}" | cksum | awk '{print $1 % 91}')
        echo "Staggering upload by ${DELAY}s..."
        sleep "$DELAY"

    - name: Publish package to pypi
      uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e  # v1.13.0
      with:
        packages-dir: ${{ matrix.module }}/dist
        user: ${{ secrets.DEV_PYPI_USERNAME }}
        password: ${{ secrets.DEV_PYPI_PASSWORD }}
        repository-url: ${{ secrets.DEV_PYPI_URL }}
        # The dev version is derived from GITHUB_RUN_NUMBER, which does not
        # change when a run is re-run. Tolerating already-uploaded files lets
        # a re-run after a partial upload finish instead of failing on the
        # duplicates.
        skip-existing: true