# Zotero Addons Scraping: workflow file for run #5231.
# (Copied from the GitHub Actions web UI; page navigation text omitted.)

# Scrapes the addon definitions under `addons/`, writes the result to
# `published/addon_infos.json`, and publishes the `published/` folder
# (including the release cache) to the `publish` branch.
name: Zotero Addons Scraping

on:
  workflow_dispatch: # manual
  schedule:
    # NOTE: GitHub evaluates cron schedules in UTC; the TZ variable below does
    # not shift them. Add 8 hours to get the Beijing wall-clock time.
    - cron: '25 4,7,10,14,19,23 * * *' # +8(beijing)
  push:
    branches: [master]

permissions:
  contents: write
  actions: write
  issues: write

env:
  TZ: Asia/Shanghai
  # Environment values are always strings; quoted to make that explicit.
  FORCE_JAVASCRIPT_ACTIONS_TO_NODE24: 'true'

jobs:
  scrape_data:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout Repository
        uses: actions/checkout@v4
        with:
          # Full history so the push range (before..sha) can be diffed below.
          fetch-depth: 0

      # Best effort: the publish branch may not exist yet, so a failed
      # checkout must not fail the job.
      - name: Checkout publish branch (for release_cache)
        uses: actions/checkout@v4
        with:
          ref: publish
          path: publish_branch
        continue-on-error: true

      - name: Restore release_cache from publish branch
        run: |
          if [ -d "publish_branch/release_cache" ]; then
            cp -r publish_branch/release_cache ./release_cache
            echo "Restored release_cache from publish branch"
            ls -la release_cache | head -20
          else
            echo "No release_cache found in publish branch, starting fresh"
          fi

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.x'

      - name: Install Dependencies
        run: |
          pip install -r requirements.txt

      # Collects the added/modified files under addons/ for this push and
      # exposes them (one path per line) as the `changed_addons` step output.
      - name: Determine changed addon files
        id: changed_addons
        if: github.event_name == 'push'
        shell: bash
        env:
          # Context values are passed through the environment instead of being
          # interpolated into the script text.
          BEFORE_SHA: ${{ github.event.before }}
          HEAD_SHA: ${{ github.sha }}
        run: |
          # An empty or all-zero "before" SHA means there is no previous commit
          # to compare against, so inspect the pushed commit on its own.
          if [ -z "$BEFORE_SHA" ] || [ "$BEFORE_SHA" = "0000000000000000000000000000000000000000" ]; then
            # Same AM filter as the range diff below, so deleted files are
            # never handed to the validator.
            files=$(git diff-tree --no-commit-id --name-only --diff-filter=AM -r "$HEAD_SHA" -- addons || true)
          else
            files=$(git diff --name-only --diff-filter=AM "$BEFORE_SHA" "$HEAD_SHA" -- addons || true)
          fi
          # Multiline output; every path is under addons/, so no line can be
          # exactly "EOF" and terminate the value early.
          {
            echo "changed_addons<<EOF"
            echo "$files"
            echo "EOF"
          } >> "$GITHUB_OUTPUT"

      - name: Validate changed addon tags
        if: github.event_name == 'push' && steps.changed_addons.outputs.changed_addons != ''
        shell: bash
        env:
          PYTHONPATH: src
          # File names come from pushed commits and may contain shell
          # metacharacters; reading them from the environment keeps them from
          # being parsed as script source.
          CHANGED_ADDONS: ${{ steps.changed_addons.outputs.changed_addons }}
        run: |
          # One array element per non-empty line.
          mapfile -t files < <(printf '%s\n' "$CHANGED_ADDONS" | sed '/^$/d')
          if [ "${#files[@]}" -eq 0 ]; then
            echo "No addon files changed"
            exit 0
          fi
          python3 -m zotero_scraper.tag_review \
            --files "${files[@]}" \
            --summary-file "$GITHUB_STEP_SUMMARY"

      - name: Run Scraper
        run: |
          python main.py \
            -i addons \
            -o published/addon_infos.json \
            --release-cache-dir release_cache \
            --github_repository "${GITHUB_REPOSITORY}" \
            --github_token "${GITHUB_TOKEN}" \
            --create_release True
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          PYTHONPATH: src

      # The cache is copied into the published folder so that the next run can
      # restore it from the publish branch (see the restore step above).
      - name: Copy release_cache to published folder
        run: |
          cp -r release_cache published/release_cache
          echo "Contents of published folder:"
          ls -la published

      - name: Publish to publish branch
        uses: peaceiris/actions-gh-pages@v4
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_branch: publish
          user_name: 'github-actions[bot]'
          user_email: 'github-actions[bot]@users.noreply.github.com'
          publish_dir: ./published
          force_orphan: true