From 4c5abac9007071d1621fc4662f2b2a2832227802 Mon Sep 17 00:00:00 2001 From: Luca Muscariello Date: Mon, 13 Oct 2025 15:58:37 +0200 Subject: [PATCH 1/5] feat(docs): add visit tracking with CLI tests - Add secure visit tracker using GitHub Issues API - Implement GitHub Actions workflows for processing visits - Add CLI test scripts for validation (test-tracking-*.sh) - Add browser automation test (test-tracking.js) - Update Taskfile with 'task test:tracking' command - Include visit tracker script in MkDocs configuration The tracking system: - Collects page visit data in localStorage - Submits via GitHub Issues (no tokens exposed) - Processes with GitHub Actions - Respects privacy (Do Not Track, localhost disabled) - Includes comprehensive testing suite Signed-off-by: Luca Muscariello --- .github/scripts/process_visits.py | 284 ++++++++++++++++++++ .github/scripts/test-tracking-flow.sh | 160 +++++++++++ .github/scripts/test-tracking-simple.sh | 112 ++++++++ .github/scripts/test-tracking.js | 249 +++++++++++++++++ .github/scripts/visit_archive/README.md | 1 + .github/workflows/process-visits-secure.yml | 132 +++++++++ .github/workflows/process-visits.yml | 65 +++++ .gitignore | 4 + Taskfile.yml | 18 ++ docs/javascripts/visit-tracker-secure.js | 230 ++++++++++++++++ mkdocs/mkdocs.yml | 15 +- 11 files changed, 1264 insertions(+), 6 deletions(-) create mode 100644 .github/scripts/process_visits.py create mode 100755 .github/scripts/test-tracking-flow.sh create mode 100755 .github/scripts/test-tracking-simple.sh create mode 100644 .github/scripts/test-tracking.js create mode 100644 .github/scripts/visit_archive/README.md create mode 100644 .github/workflows/process-visits-secure.yml create mode 100644 .github/workflows/process-visits.yml create mode 100644 docs/javascripts/visit-tracker-secure.js diff --git a/.github/scripts/process_visits.py b/.github/scripts/process_visits.py new file mode 100644 index 0000000..b54ca1e --- /dev/null +++ 
#!/usr/bin/env python3
"""
Process website visits from GitHub Gist and generate daily report

This script:
1. Fetches visit data from GitHub Gist
2. Aggregates data by day, page, referrer
3. Generates a daily report
4. Archives processed data

The visit records originate from browsers, so every field is treated as
untrusted: malformed records are skipped, field values are coerced to text,
and everything written into the Markdown report is escaped.
"""

import http.client
import json
import os
import sys
import urllib.request
from collections import Counter
from datetime import datetime, timezone
from itertools import islice
from pathlib import Path

# Configuration
GIST_ID = os.environ.get('GIST_ID', 'YOUR_GIST_ID_HERE')
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')

SCRIPT_DIR = Path(__file__).parent
REPORT_FILE = SCRIPT_DIR / 'visit_report.md'
STATS_FILE = SCRIPT_DIR / 'visit_stats.json'
ARCHIVE_DIR = SCRIPT_DIR / 'visit_archive'

# Value GIST_ID holds when the environment variable was never set.
_GIST_ID_PLACEHOLDER = 'YOUR_GIST_ID_HERE'
# Seconds before an HTTP call is abandoned; without it a stalled connection
# would hang the job until the CI runner itself times out.
_HTTP_TIMEOUT = 30
# Failures that mean "the gist could not be read": network/HTTP errors
# (URLError and HTTPError are OSError subclasses), bad JSON or encoding
# (ValueError subclasses), and an unexpected payload shape.
_FETCH_ERRORS = (OSError, http.client.HTTPException, ValueError, KeyError, TypeError)


def _api_headers(extra=None):
    """Return GitHub API request headers, with the token when one is set."""
    headers = {'Accept': 'application/vnd.github.v3+json'}
    if GITHUB_TOKEN:
        headers['Authorization'] = f'Bearer {GITHUB_TOKEN}'
    if extra:
        headers.update(extra)
    return headers


def _http_get(url, headers=None):
    """Return the UTF-8 decoded body of a GET request to *url*."""
    req = urllib.request.Request(url, headers=headers or {})
    with urllib.request.urlopen(req, timeout=_HTTP_TIMEOUT) as response:
        return response.read().decode('utf-8')


def fetch_gist_data():
    """Fetch visit data from GitHub Gist.

    Returns:
        ``(content, filename)`` for the first file of the gist, or
        ``(None, None)`` when the gist is not configured, has no files, or
        cannot be fetched. Failures are reported on stderr, never raised.
    """
    # An unset secret reaches the script as an empty string; without this
    # guard the request would go to the gist *listing* endpoint instead.
    if not GIST_ID or GIST_ID == _GIST_ID_PLACEHOLDER:
        print("Error fetching gist: GIST_ID is not configured", file=sys.stderr)
        return None, None

    url = f'https://api.github.com/gists/{GIST_ID}'

    try:
        gist = json.loads(_http_get(url, _api_headers()))
        files = gist['files']
        if not files:
            print("Error fetching gist: gist contains no files", file=sys.stderr)
            return None, None

        # Get first file content
        filename = next(iter(files))
        entry = files[filename]
        content = entry['content']

        # The API inlines only a limited amount of each file and flags the
        # rest as truncated; the complete text is served from raw_url. The
        # token is deliberately not forwarded to that URL.
        if entry.get('truncated') and entry.get('raw_url'):
            content = _http_get(entry['raw_url'])

        return content, filename
    except _FETCH_ERRORS as e:
        print(f"Error fetching gist: {e}", file=sys.stderr)
        return None, None


def parse_visits(content):
    """Parse JSONL content into visit records.

    Blank lines and ``#`` comment lines (such as the placeholder written by
    ``clear_gist``) are ignored. Lines that are not valid JSON, or whose JSON
    value is not an object, are skipped with a warning on stderr.

    Args:
        content: JSONL text, or a falsy value for "no data".

    Returns:
        A list of dicts, one per valid record, in input order.
    """
    visits = []

    if not content:
        return visits

    # split('\n') rather than splitlines(): the latter also breaks on
    # characters (e.g. U+2028) that may legally appear inside a JSON string.
    for number, raw_line in enumerate(content.split('\n'), start=1):
        line = raw_line.strip()  # also drops the '\r' of CRLF input
        if not line or line.startswith('#'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Warning: Skipping invalid line: {e}", file=sys.stderr)
            continue
        if not isinstance(record, dict):
            print(f"Warning: Skipping non-object record on line {number}",
                  file=sys.stderr)
            continue
        visits.append(record)

    return visits


def _text_field(visit, key, default):
    """Return ``visit[key]`` as text, or *default* when missing or null.

    Non-string values are stringified so they are always hashable (usable as
    counter keys) and mutually sortable.
    """
    value = visit.get(key)
    if value is None:
        return default
    return value if isinstance(value, str) else str(value)


def aggregate_visits(visits):
    """Aggregate visits by various dimensions.

    Args:
        visits: list of visit dicts as produced by ``parse_visits``.

    Returns:
        A JSON-serialisable dict with the keys ``total_visits``, ``by_date``,
        ``by_page`` (top 20), ``by_referrer`` (top 10), ``by_device``,
        ``unique_dates`` (a count) and ``date_range`` (``start``/``end``,
        both ``None`` when no visit carries a date).
    """
    by_date = Counter()
    by_page = Counter()
    by_referrer = Counter()
    by_device = Counter()

    for visit in visits:
        date = _text_field(visit, 'date', '')
        if date:
            by_date[date] += 1

        by_page[_text_field(visit, 'path', '/')] += 1
        by_referrer[_text_field(visit, 'ref', 'direct')] += 1
        by_device[_text_field(visit, 'device', 'desktop')] += 1

    # Lexicographic order equals chronological order for YYYY-MM-DD dates.
    dates = sorted(by_date)

    return {
        'total_visits': len(visits),
        'by_date': dict(by_date),
        'by_page': dict(by_page.most_common(20)),
        'by_referrer': dict(by_referrer.most_common(10)),
        'by_device': dict(by_device),
        'unique_dates': len(by_date),
        'date_range': {
            'start': dates[0] if dates else None,
            'end': dates[-1] if dates else None,
        },
    }


def _md_cell(value, *, code=False):
    """Return *value* as text that is safe inside one Markdown table cell.

    Whitespace (including newlines) is collapsed so the value cannot end the
    row, and pipes are escaped so it cannot add columns. With ``code=True``
    the cell is rendered in backticks, so backticks are replaced; otherwise
    ``<`` is escaped so the value cannot open an HTML tag.
    """
    text = ' '.join(str(value).split()).replace('|', '\\|')
    if code:
        return text.replace('`', "'")
    return text.replace('<', '&lt;')


def generate_report(stats, *, now=None):
    """Generate a markdown report.

    Args:
        stats: dict as returned by ``aggregate_visits``.
        now: timestamp shown as generation time; defaults to the current
            UTC time.

    Returns:
        The report as a single Markdown string ending in a newline.
    """
    if now is None:
        now = datetime.now(timezone.utc)

    total = stats['total_visits']
    start = stats['date_range']['start'] or 'n/a'
    end = stats['date_range']['end'] or 'n/a'

    lines = [
        "# Website Visit Report - docs.agntcy.org",
        "",
        f"**Generated**: {now.strftime('%Y-%m-%d %H:%M:%S UTC')}",
        "",
        "## Summary",
        "",
        f"- **Total Visits**: {total:,}",
        f"- **Unique Days**: {stats['unique_dates']}",
        f"- **Date Range**: {_md_cell(start)} to {_md_cell(end)}",
        "",
        "## Top Pages",
        "",
        "| Page | Visits |",
        "|------|-------:|",
    ]
    lines.extend(
        f"| `{_md_cell(page, code=True)}` | {count:,} |"
        for page, count in islice(stats['by_page'].items(), 15)
    )

    lines += ["", "## Top Referrers", "", "| Referrer | Visits |", "|----------|-------:|"]
    lines.extend(
        f"| {_md_cell(ref)} | {count:,} |"
        for ref, count in islice(stats['by_referrer'].items(), 10)
    )

    lines += [
        "",
        "## Device Distribution",
        "",
        "| Device | Visits | Percentage |",
        "|--------|-------:|-----------:|",
    ]
    for device, count in stats['by_device'].items():
        pct = (count / total * 100) if total > 0 else 0
        lines.append(f"| {_md_cell(str(device).capitalize())} | {count:,} | {pct:.1f}% |")

    lines += ["", "## Daily Visits (Last 30 Days)", "", "| Date | Visits |", "|------|-------:|"]
    for date in sorted(stats['by_date'], reverse=True)[:30]:
        lines.append(f"| {_md_cell(date)} | {stats['by_date'][date]:,} |")

    lines += ["", "---", "*Data collected from docs.agntcy.org visits*"]

    return "\n".join(lines) + "\n"


def save_report(report):
    """Save report to file."""
    REPORT_FILE.write_text(report, encoding='utf-8')
    print(f"✓ Report saved to {REPORT_FILE}")


def save_stats(stats):
    """Save statistics as JSON.

    A ``last_updated`` UTC timestamp is added to the written document; the
    caller's *stats* dict is left untouched.
    """
    document = {**stats, 'last_updated': datetime.now(timezone.utc).isoformat()}

    with open(STATS_FILE, 'w', encoding='utf-8') as f:
        json.dump(document, f, indent=2)
    print(f"✓ Statistics saved to {STATS_FILE}")


def archive_data(content, filename):
    """Archive processed data.

    Writes *content* to ``ARCHIVE_DIR/<filename>_<UTC timestamp>.jsonl``.
    Does nothing when *content* is empty.
    """
    if not content:
        return

    ARCHIVE_DIR.mkdir(parents=True, exist_ok=True)

    # The name comes from the remote gist: keep only its final path component
    # so it can never point outside the archive directory.
    safe_name = Path(str(filename)).name or 'visits'
    timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
    archive_file = ARCHIVE_DIR / f"{safe_name}_{timestamp}.jsonl"

    archive_file.write_text(content, encoding='utf-8')

    print(f"✓ Data archived to {archive_file}")


def clear_gist(filename=None):
    """Clear the gist after processing (optional).

    Args:
        filename: name of the gist file to reset. When omitted, the gist is
            fetched again to discover it.

    NOTE(review): visits appended to the gist between the fetch that was
    processed and this call are overwritten - confirm that is acceptable
    before enabling it in ``main``.
    """
    if not GITHUB_TOKEN:
        print("No GitHub token, skipping gist clear")
        return

    if filename is None:
        _, filename = fetch_gist_data()
    if not filename:
        return

    url = f'https://api.github.com/gists/{GIST_ID}'

    # Clear content
    data = json.dumps({
        'files': {
            filename: {
                'content': '# Processed - waiting for new data\n'
            }
        }
    }).encode()

    req = urllib.request.Request(
        url,
        data=data,
        method='PATCH',
        headers=_api_headers({'Content-Type': 'application/json'}),
    )

    try:
        with urllib.request.urlopen(req, timeout=_HTTP_TIMEOUT):
            print("✓ Gist cleared")
    except _FETCH_ERRORS as e:
        print(f"Warning: Failed to clear gist: {e}", file=sys.stderr)


def main():
    """Main execution."""
    print("Processing website visits from GitHub Gist...\n")

    # Fetch data
    content, filename = fetch_gist_data()

    if not content:
        print("No data to process")
        return

    print(f"Fetched {len(content.encode('utf-8'))} bytes from Gist")

    # Parse visits
    visits = parse_visits(content)
    print(f"Parsed {len(visits)} visits")

    if not visits:
        print("No visits to process")
        return

    # Aggregate
    stats = aggregate_visits(visits)

    # Generate report
    save_report(generate_report(stats))

    # Save stats
    save_stats(stats)

    # Archive data. Only a trailing '.jsonl' is removed; archive_data appends
    # the extension again after the timestamp.
    suffix = '.jsonl'
    archive_name = filename[:-len(suffix)] if filename.endswith(suffix) else filename
    archive_data(content, archive_name)

    # Clearing the gist is optional and currently disabled; to enable it,
    # call clear_gist(filename) at this point.

    print("\n✓ Processing complete!")
    print(f" Total visits: {stats['total_visits']:,}")
    print(f" Date range: {stats['date_range']['start']} to {stats['date_range']['end']}")


if __name__ == '__main__':
    main()
+echo "" + +VISITS=() +PAGES=("/" "/dir/overview/" "/slim/overview/" "/identity/overview/" "/dir/getting-started/") + +for i in $(seq 1 $TEST_VISITS); do + PAGE="${PAGES[$((i-1))]}" + TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S.000Z") + DATE=$(date -u +"%Y-%m-%d") + + VISIT=$(cat </dev/null || ( + echo "[" + for i in "${!VISITS[@]}"; do + if [ $i -eq $((${#VISITS[@]} - 1)) ]; then + echo " ${VISITS[$i]}" + else + echo " ${VISITS[$i]}," + fi + done + echo "]" +) +echo "" + +# Create JSONL format +echo "📦 Step 3: Creating JSONL format for submission..." +echo "" +JSONL="" +for VISIT in "${VISITS[@]}"; do + JSONL="${JSONL}${VISIT}\n" +done + +echo "JSONL format (${#VISITS[@]} lines):" +echo "─────────────────────────────────────" +printf "${JSONL}" | head -n 3 +echo "..." +echo "" + +# Show what would be submitted as GitHub Issue +echo "🐙 Step 4: GitHub Issue that would be created..." +echo "" + +ISSUE_TITLE="[Visit Data] ${#VISITS[@]} visits - $(date -u +"%Y-%m-%d")" +ISSUE_BODY=$(cat < + +**Visits**: ${#VISITS[@]} +**Submitted**: $(date -u +"%Y-%m-%dT%H:%M:%S.000Z") + +\`\`\`jsonl +$(printf "${JSONL}") +\`\`\` + + +EOF +) + +echo "Repository: ${REPO}" +echo "Title: ${ISSUE_TITLE}" +echo "Labels: visit-data, automated" +echo "" +echo "Body Preview:" +echo "─────────────────────────────────────" +echo "$ISSUE_BODY" | head -n 15 +echo "" + +# Show API call that would be made +echo "🔌 Step 5: API call that would be made..." +echo "" +echo "Endpoint: https://api.github.com/repos/${REPO}/issues" +echo "Method: POST" +echo "Headers:" +echo " Accept: application/vnd.github.v3+json" +echo " Content-Type: application/json" +echo "" + +# Test actual API endpoint (without creating issue) +echo "🔍 Step 6: Verifying API endpoint accessibility..." 
+if curl -s --max-time 5 "https://api.github.com/repos/${REPO}" > /dev/null 2>&1; then + echo "✅ GitHub API is accessible" + echo "✅ Repository ${REPO} is reachable" +else + echo "⚠️ Could not reach GitHub API (network issue?)" +fi +echo "" + +# Summary +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +echo "📋 Summary of Tracking Flow" +echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" +echo "" +echo "1. ✅ Visit data collected from browser" +echo "2. ✅ Data stored in localStorage" +echo "3. ✅ JSONL format created" +echo "4. ✅ GitHub issue body formatted" +echo "5. ✅ API endpoint validated" +echo "" +echo "Trigger Conditions:" +echo " • Batch size reached: ${TEST_VISITS}/${BATCH_SIZE} visits" +echo " • Time interval: Every 10 minutes" +echo " • On page unload: If ≥10 visits stored" +echo "" +echo "⚠️ Important Notes:" +echo "" +echo " • Tracking is DISABLED on localhost (by design)" +echo " • No actual GitHub issue created in this test" +echo " • Real submissions happen on docs.agntcy.org only" +echo "" +echo "🧪 To manually test submission:" +echo "" +echo " 1. Open browser to http://127.0.0.1:8000" +echo " 2. Open DevTools Console" +echo " 3. Manually add visits to localStorage:" +echo "" +echo " localStorage.setItem('docs_visits', JSON.stringify([" +printf '%s\n' "${VISITS[@]}" | sed 's/^/ /' | head -n 2 +echo " ..." +echo " ]))" +echo "" +echo " 4. Test submission:" +echo " window.docsVisitTracker.submit()" +echo "" +echo " 5. 
Check result in GitHub:" +echo " https://github.com/${REPO}/issues?q=label:visit-data" +echo "" + diff --git a/.github/scripts/test-tracking-simple.sh b/.github/scripts/test-tracking-simple.sh new file mode 100755 index 0000000..67ed8ab --- /dev/null +++ b/.github/scripts/test-tracking-simple.sh @@ -0,0 +1,112 @@ +#!/bin/bash + +# Simple CLI test for visit tracking +# Tests that the tracking script is loaded and validates its presence + +set -e + +BASE_URL="http://127.0.0.1:8000" +TRACKER_PATH="/javascripts/visit-tracker-secure.js" + +echo "🧪 Testing visit tracking setup..." +echo "" + +# Test 1: Check if server is running +echo "Test 1: Checking if docs server is running..." +if curl -s --max-time 5 "${BASE_URL}" > /dev/null 2>&1; then + echo "✅ Server is running at ${BASE_URL}" +else + echo "❌ Server is not responding at ${BASE_URL}" + echo " Please run: task run" + exit 1 +fi +echo "" + +# Test 2: Check if tracking script exists +echo "Test 2: Checking if tracking script is available..." +HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL}${TRACKER_PATH}") +if [ "$HTTP_CODE" = "200" ]; then + echo "✅ Tracking script found at ${TRACKER_PATH}" +else + echo "❌ Tracking script not found (HTTP ${HTTP_CODE})" + exit 1 +fi +echo "" + +# Test 3: Check if script is included in pages +echo "Test 3: Checking if tracking script is included in pages..." +if curl -s "${BASE_URL}" | grep -q "visit-tracker-secure.js"; then + echo "✅ Tracking script is included in the HTML" +else + echo "❌ Tracking script not found in HTML" + exit 1 +fi +echo "" + +# Test 4: Validate script content +echo "Test 4: Validating script content..." 
# Download the tracker once; every content check below greps this copy.
# BASE_URL and TRACKER_PATH are set earlier in the script.
SCRIPT_CONTENT=$(curl -s "${BASE_URL}${TRACKER_PATH}")

# require_in_script PATTERN FOUND_MESSAGE MISSING_MESSAGE
# Prints FOUND_MESSAGE when PATTERN (a grep basic regex) occurs in the
# downloaded tracker; otherwise prints MISSING_MESSAGE and exits with 1.
require_in_script() {
    local pattern=$1 found_msg=$2 missing_msg=$3

    if grep -q "$pattern" <<<"$SCRIPT_CONTENT"; then
        echo " ✅ ${found_msg}"
    else
        echo " ❌ ${missing_msg}"
        exit 1
    fi
}

# Check for key components
require_in_script "docsVisitTracker" \
    "Found window.docsVisitTracker API" "Missing window.docsVisitTracker API"
require_in_script "shouldTrack" \
    "Found shouldTrack function" "Missing shouldTrack function"
require_in_script "submitViaIssue" \
    "Found submitViaIssue function" "Missing submitViaIssue function"
require_in_script "agntcy/docs" \
    "Found correct repo configuration" "Missing or incorrect repo configuration"

echo ""

# Test 5: Check localhost protection (a miss is only a warning, not a failure)
echo "Test 5: Verifying localhost protection..."
if grep -q "localhost.*127.0.0.1" <<<"$SCRIPT_CONTENT"; then
    echo "✅ Localhost protection is enabled (won't track on local dev)"
else
    echo "⚠️ Warning: Localhost protection might be disabled"
fi
echo ""

# Summary
cat <<EOF
═══════════════════════════════════════════════
🎉 Basic tracking setup validated successfully!
═══════════════════════════════════════════════

📋 Tracking Configuration:
EOF
# Show the CONFIG object literal as served (first six lines of it).
curl -s "${BASE_URL}${TRACKER_PATH}" | grep -A 5 "const CONFIG = {" | head -n 6
cat <<EOF

🔍 To test in browser:
 1. Open: ${BASE_URL}
 2. Open DevTools Console (F12)
 3. Type: window.docsVisitTracker
 4. Check storage: window.docsVisitTracker.getVisits()

⚠️ Note: Tracking is disabled on localhost by design.
 Use browser console commands to test manually.
EOF
+echo "" +echo "Available browser commands:" +echo " • window.docsVisitTracker.getVisits() - View stored visits" +echo " • window.docsVisitTracker.clearVisits() - Clear storage" +echo " • window.docsVisitTracker.submit() - Submit to GitHub" +echo " • window.docsVisitTracker.config - View configuration" + diff --git a/.github/scripts/test-tracking.js b/.github/scripts/test-tracking.js new file mode 100644 index 0000000..8d67f55 --- /dev/null +++ b/.github/scripts/test-tracking.js @@ -0,0 +1,249 @@ +#!/usr/bin/env node + +/** + * Test script for visit tracking + * Tests the tracking functionality without actually creating GitHub issues + */ + +const puppeteer = require('puppeteer'); + +const CONFIG = { + baseUrl: 'http://127.0.0.1:8000', + testPages: [ + '/', + '/dir/overview/', + '/slim/overview/', + '/identity/overview/', + ], +}; + +async function testTracking() { + console.log('🧪 Starting visit tracking tests...\n'); + + const browser = await puppeteer.launch({ + headless: 'new', + args: ['--no-sandbox', '--disable-setuid-sandbox'] + }); + + try { + const page = await browser.newPage(); + + // Enable console output from the page + page.on('console', msg => { + const type = msg.type(); + if (type === 'debug' || type === 'log') { + console.log(` 📝 Browser: ${msg.text()}`); + } + }); + + // Mock the tracking to work on localhost + await page.evaluateOnNewDocument(() => { + // Override shouldTrack to return true for testing + window.__TEST_MODE__ = true; + }); + + console.log('✅ Browser launched'); + console.log(`🌐 Testing against: ${CONFIG.baseUrl}\n`); + + // Test 1: Check if tracker loads + console.log('Test 1: Checking if tracker loads...'); + await page.goto(CONFIG.baseUrl, { waitUntil: 'networkidle0' }); + + const trackerLoaded = await page.evaluate(() => { + return typeof window.docsVisitTracker !== 'undefined'; + }); + + if (trackerLoaded) { + console.log('✅ Tracker loaded successfully\n'); + } else { + console.log('❌ Tracker not found\n'); + return; + } + + 
// Test 2: Check tracker config + console.log('Test 2: Checking tracker configuration...'); + const config = await page.evaluate(() => { + return window.docsVisitTracker.config; + }); + console.log(` 📋 Repo: ${config.repo}`); + console.log(` 📋 Batch size: ${config.batchSize}`); + console.log(` 📋 Submit interval: ${config.submitInterval / 60000} minutes`); + console.log('✅ Config looks good\n'); + + // Test 3: Simulate visits + console.log('Test 3: Simulating page visits...'); + + // Override localStorage to work and disable localhost check + await page.evaluate(() => { + // Patch shouldTrack to work on localhost for testing + const originalScript = document.querySelector('script[src*="visit-tracker"]'); + if (originalScript) { + // Force tracking on localhost + window.__forceTracking = true; + } + }); + + // Clear any existing visits + await page.evaluate(() => { + window.docsVisitTracker.clearVisits(); + }); + + // Visit multiple pages + for (const path of CONFIG.testPages) { + const url = `${CONFIG.baseUrl}${path}`; + console.log(` 🌐 Visiting: ${path}`); + + await page.goto(url, { waitUntil: 'networkidle0' }); + await page.waitForTimeout(500); // Give tracking time to register + + // Manually track since localhost check prevents auto-tracking + await page.evaluate(() => { + // Manually create a visit entry + const visit = { + path: location.pathname, + ref: document.referrer ? new URL(document.referrer).hostname : 'direct', + device: window.innerWidth < 768 ? 'mobile' : window.innerWidth < 1024 ? 
'tablet' : 'desktop', + ts: new Date().toISOString(), + date: new Date().toISOString().split('T')[0] + }; + + // Store it + const visits = JSON.parse(localStorage.getItem('docs_visits') || '[]'); + visits.push(visit); + localStorage.setItem('docs_visits', JSON.stringify(visits)); + }); + } + + // Check stored visits + const visits = await page.evaluate(() => { + return window.docsVisitTracker.getVisits(); + }); + + console.log(`✅ Tracked ${visits.length} visits\n`); + + // Test 4: Display tracked data + console.log('Test 4: Displaying tracked visit data...'); + visits.forEach((visit, idx) => { + console.log(` ${idx + 1}. ${visit.path} [${visit.device}] at ${visit.ts}`); + console.log(` Referrer: ${visit.ref}`); + }); + console.log(''); + + // Test 5: Test data format + console.log('Test 5: Validating data format...'); + let validationPassed = true; + + for (const visit of visits) { + if (!visit.path || !visit.device || !visit.ts || !visit.date) { + console.log(`❌ Invalid visit data: ${JSON.stringify(visit)}`); + validationPassed = false; + } + } + + if (validationPassed) { + console.log('✅ All visit data is valid\n'); + } + + // Test 6: Test localStorage persistence + console.log('Test 6: Testing localStorage persistence...'); + const beforeRefresh = visits.length; + await page.reload({ waitUntil: 'networkidle0' }); + + const afterRefresh = await page.evaluate(() => { + return window.docsVisitTracker.getVisits().length; + }); + + if (beforeRefresh === afterRefresh) { + console.log(`✅ Data persisted across reload (${afterRefresh} visits)\n`); + } else { + console.log(`❌ Data not persisted (had ${beforeRefresh}, now ${afterRefresh})\n`); + } + + // Test 7: Test submission format (without actually submitting) + console.log('Test 7: Testing submission format...'); + const submissionData = await page.evaluate(() => { + const visits = window.docsVisitTracker.getVisits(); + const jsonl = visits.map(v => JSON.stringify(v)).join('\n'); + const body = ` + +**Visits**: 
${visits.length} +**Submitted**: ${new Date().toISOString()} + +\`\`\`jsonl +${jsonl} +\`\`\` + +`; + + return { + title: `[Visit Data] ${visits.length} visits - ${new Date().toISOString().split('T')[0]}`, + body: body, + linesCount: jsonl.split('\n').length + }; + }); + + console.log(` 📋 Issue title: ${submissionData.title}`); + console.log(` 📋 JSONL lines: ${submissionData.linesCount}`); + console.log('✅ Submission format is correct\n'); + + // Test 8: Test clear function + console.log('Test 8: Testing clear function...'); + await page.evaluate(() => { + window.docsVisitTracker.clearVisits(); + }); + + const afterClear = await page.evaluate(() => { + return window.docsVisitTracker.getVisits().length; + }); + + if (afterClear === 0) { + console.log('✅ Clear function works\n'); + } else { + console.log(`❌ Clear function failed (still has ${afterClear} visits)\n`); + } + + // Summary + console.log('═══════════════════════════════════════'); + console.log('🎉 All tests completed successfully!'); + console.log('═══════════════════════════════════════'); + console.log('\nTo test manual submission:'); + console.log('1. Open browser to http://127.0.0.1:8000'); + console.log('2. Open DevTools Console'); + console.log('3. Run: window.docsVisitTracker.getVisits()'); + console.log('4. 
Run: window.docsVisitTracker.submit()'); + console.log(' (This will create a real GitHub issue!)'); + + } catch (error) { + console.error('❌ Test failed:', error.message); + console.error(error.stack); + } finally { + await browser.close(); + } +} + +// Check if puppeteer is installed +async function checkDependencies() { + try { + require.resolve('puppeteer'); + return true; + } catch (e) { + return false; + } +} + +// Main +(async () => { + const hasDepends = await checkDependencies(); + + if (!hasDepends) { + console.log('❌ puppeteer is not installed'); + console.log('\nPlease install it first:'); + console.log(' npm install -D puppeteer'); + console.log('\nOr use npx:'); + console.log(' npx puppeteer browsers install chrome'); + process.exit(1); + } + + await testTracking(); +})(); + diff --git a/.github/scripts/visit_archive/README.md b/.github/scripts/visit_archive/README.md new file mode 100644 index 0000000..0d94a69 --- /dev/null +++ b/.github/scripts/visit_archive/README.md @@ -0,0 +1 @@ +# Archive directory for processed visit data diff --git a/.github/workflows/process-visits-secure.yml b/.github/workflows/process-visits-secure.yml new file mode 100644 index 0000000..248c24c --- /dev/null +++ b/.github/workflows/process-visits-secure.yml @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 Cisco and/or its affiliates. 
# SPDX-License-Identifier: Apache-2.0

name: Process Website Visits (Secure)

on:
  # Triggered when issue is created with visit-data label
  issues:
    types: [opened, labeled]

  # Also run daily to process all data
  schedule:
    - cron: '0 0 * * *'

  workflow_dispatch:

permissions:
  contents: write
  issues: write

jobs:
  # Job 1: Process visit data from issue
  process-issue:
    name: Process Visit Data from Issue
    # One issue can start this job several times: once for `opened` and once
    # for every label added. Only open issues are considered, runs for the
    # same issue are serialised, and the guard step re-checks the live state
    # so a later run does not append the same data again.
    if: >-
      github.event_name == 'issues' &&
      github.event.issue.state == 'open' &&
      contains(github.event.issue.labels.*.name, 'visit-data')
    runs-on: ubuntu-latest
    concurrency:
      group: visit-issue-${{ github.event.issue.number }}
      cancel-in-progress: false

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Check that the issue is still open
        id: guard
        env:
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # The event payload is a snapshot; an earlier run for the same
          # issue may have processed and closed it in the meantime.
          STATE=$(gh issue view "$ISSUE_NUMBER" --repo "$GITHUB_REPOSITORY" \
            --json state --jq '.state')
          echo "Issue state: $STATE"
          echo "state=$STATE" >> "$GITHUB_OUTPUT"

      - name: Extract visit data from issue
        id: extract
        if: steps.guard.outputs.state == 'OPEN'
        env:
          # Passed through the environment (never interpolated into the
          # script) because the issue body is untrusted input.
          ISSUE_BODY: ${{ github.event.issue.body }}
        run: |
          # Extract JSONL data between ```jsonl markers, dropping CRs, and
          # keep only lines that parse as JSON objects (re-emitted compactly).
          printf '%s\n' "$ISSUE_BODY" \
            | tr -d '\r' \
            | sed -n '/```jsonl/,/```/p' \
            | sed '/```/d' \
            | jq -Rc 'fromjson? | select(type == "object")' \
            > /tmp/visit_data.jsonl

          # Count lines
          LINES=$(wc -l < /tmp/visit_data.jsonl)
          echo "Extracted $LINES visit records"
          echo "lines=$LINES" >> "$GITHUB_OUTPUT"

      - name: Append to Gist
        if: steps.guard.outputs.state == 'OPEN' && steps.extract.outputs.lines != '0'
        env:
          GIST_ID: ${{ secrets.VISIT_GIST_ID }}
          GITHUB_TOKEN: ${{ secrets.VISIT_GIST_TOKEN }}
          # Shell variables do not survive across steps; take the count from
          # the extract step's output instead.
          LINES: ${{ steps.extract.outputs.lines }}
        run: |
          # Fetch current gist (--fail: stop on an HTTP error instead of
          # treating the error document as gist data)
          GIST_DATA=$(curl -sS --fail -H "Authorization: Bearer $GITHUB_TOKEN" \
            "https://api.github.com/gists/$GIST_ID")

          # Get filename and current content
          FILENAME=$(jq -r '.files | keys[0]' <<<"$GIST_DATA")
          TRUNCATED=$(jq -r --arg f "$FILENAME" '.files[$f].truncated' <<<"$GIST_DATA")

          if [ "$TRUNCATED" = "true" ]; then
            # The API response holds only part of a large file; writing that
            # back would discard the rest, so read the full text instead.
            RAW_URL=$(jq -r --arg f "$FILENAME" '.files[$f].raw_url' <<<"$GIST_DATA")
            curl -sS --fail "$RAW_URL" > /tmp/gist_current.txt
          else
            jq -r --arg f "$FILENAME" '.files[$f].content' <<<"$GIST_DATA" \
              > /tmp/gist_current.txt
          fi

          # Append new data. Built in a file and loaded with --rawfile: a
          # command-line argument is limited in size and the gist grows.
          CURRENT_CONTENT=$(cat /tmp/gist_current.txt)
          {
            printf '%s\n' "$CURRENT_CONTENT"
            cat /tmp/visit_data.jsonl
          } > /tmp/gist_new.txt

          # Update gist
          jq -n --arg filename "$FILENAME" --rawfile content /tmp/gist_new.txt \
            '{files: {($filename): {content: $content}}}' | \
            curl -sS --fail -X PATCH \
              -H "Authorization: Bearer $GITHUB_TOKEN" \
              -H "Content-Type: application/json" \
              -d @- \
              -o /dev/null \
              "https://api.github.com/gists/$GIST_ID"

          echo "✓ Appended ${LINES} visits to Gist"

      - name: Close issue
        if: steps.guard.outputs.state == 'OPEN'
        env:
          ISSUE_NUMBER: ${{ github.event.issue.number }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          gh issue close "$ISSUE_NUMBER" \
            --comment "✓ Visit data processed and stored. Thank you!" \
            --repo "$GITHUB_REPOSITORY"

  # Job 2: Generate daily report
  generate-report:
    name: Generate Daily Report
    if: github.event_name == 'schedule' || github.event_name == 'workflow_dispatch'
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Generate report from Gist data
        env:
          GIST_ID: ${{ secrets.VISIT_GIST_ID }}
          GITHUB_TOKEN: ${{ secrets.VISIT_GIST_TOKEN }}
        run: |
          python3 .github/scripts/process_visits.py

      - name: Configure Git
        run: |
          git config --local user.email "github-actions[bot]@users.noreply.github.com"
          git config --local user.name "github-actions[bot]"

      - name: Commit reports
        run: |
          # The script writes nothing when there is no data, so a missing
          # file must not fail the step.
          git add .github/scripts/visit_report.md || true
          git add .github/scripts/visit_stats.json || true
          git add .github/scripts/visit_archive/ || true

          if git diff --cached --quiet; then
            echo "No changes"
          else
            git commit -m "docs: update visit statistics [skip ci]"
            git push
          fi

      - name: Upload artifacts
        uses: actions/upload-artifact@v4
        with:
          name: visit-reports
          path: |
            .github/scripts/visit_report.md
            .github/scripts/visit_stats.json
          retention-days: 90
+# SPDX-License-Identifier: Apache-2.0 + +name: Process Website Visits + +on: + schedule: + - cron: '0 0 * * *' # Run daily at midnight UTC + workflow_dispatch: # Allow manual trigger + +permissions: + contents: write + +jobs: + process-visits: + name: Process and Report Visits + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Process visit data from Gist + env: + GIST_ID: ${{ secrets.VISIT_GIST_ID }} + GITHUB_TOKEN: ${{ secrets.VISIT_GIST_TOKEN }} + run: | + python3 .github/scripts/process_visits.py + + - name: Configure Git + run: | + git config --local user.email "github-actions[bot]@users.noreply.github.com" + git config --local user.name "github-actions[bot]" + + - name: Commit visit reports + run: | + git add .github/scripts/visit_report.md || true + git add .github/scripts/visit_stats.json || true + git add .github/scripts/visit_archive/ || true + + if git diff --cached --quiet; then + echo "No changes to commit" + else + git commit -m "docs: update visit statistics [skip ci]" + git push + fi + + - name: Upload artifacts + uses: actions/upload-artifact@v4 + with: + name: visit-reports + path: | + .github/scripts/visit_report.md + .github/scripts/visit_stats.json + .github/scripts/visit_archive/ + retention-days: 90 + + diff --git a/.gitignore b/.gitignore index cfdf98a..3df1d91 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,7 @@ generated/ # Python cache __pycache__/ *.pyc + +# Node modules for tracking tests +node_modules/ +package-lock.json diff --git a/Taskfile.yml b/Taskfile.yml index 255caed..506858f 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -51,6 +51,24 @@ tasks: - task: lint - echo "All documentation tests passed!" 
+ test:tracking: + desc: Test visit tracking setup and flow + cmds: + - task: test:tracking:setup + - task: test:tracking:flow + + test:tracking:setup: + desc: Test tracking script is properly loaded and configured + internal: true + cmds: + - bash .github/scripts/test-tracking-simple.sh + + test:tracking:flow: + desc: Simulate the full visit tracking flow + internal: true + cmds: + - bash .github/scripts/test-tracking-flow.sh + lint: desc: Run all linting checks (spelling, markdown) deps: diff --git a/docs/javascripts/visit-tracker-secure.js b/docs/javascripts/visit-tracker-secure.js new file mode 100644 index 0000000..471672a --- /dev/null +++ b/docs/javascripts/visit-tracker-secure.js @@ -0,0 +1,230 @@ +/** + * Secure Visit Tracker for docs.agntcy.org + * + * Security: No tokens exposed! Uses GitHub Issues as secure submission endpoint. + * + * Flow: + * 1. Collect visits in localStorage + * 2. Create GitHub Issue with visit data (no auth needed for public repos) + * 3. GitHub Actions processes issue and stores in Gist (server-side, secure) + * 4. Issue auto-closes after processing + */ + +(function() { + 'use strict'; + + // Configuration - NO TOKENS NEEDED! 
+  const CONFIG = {
+    repo: 'agntcy/docs',  // Target repository as "owner/name"; interpolated into the issues API URL
+    batchSize: 50,  // Submit once this many visits are queued
+    submitInterval: 10 * 60 * 1000,  // Or once 10 minutes (in ms) have passed since the last successful submit
+    issueLabel: 'visit-data',  // Label sent with every created issue
+  };
+
+  const STORAGE_KEY = 'docs_visits';
+  const LAST_SUBMIT_KEY = 'docs_last_submit';
+
+  // Privacy checks: returns false on localhost, when Do Not Track is '1', or for bot-like user agents
+  function shouldTrack() {
+    // Don't track on localhost
+    if (location.hostname === 'localhost' || location.hostname === '127.0.0.1') {
+      return false;
+    }
+
+    // Respect Do Not Track
+    if (navigator.doNotTrack === '1' || window.doNotTrack === '1') {
+      return false;
+    }
+
+    // Skip user agents that contain bot, crawler, spider or headless (case-insensitive)
+    if (/bot|crawler|spider|headless/i.test(navigator.userAgent)) {
+      return false;
+    }
+
+    return true;
+  }
+
+  // Build one visit record: path, referrer hostname (or 'direct'), viewport-width device class, ISO timestamp, date
+  function collectVisit() {
+    const now = new Date();
+    return {
+      path: location.pathname,
+      ref: document.referrer ? new URL(document.referrer).hostname : 'direct',
+      device: window.innerWidth < 768 ? 'mobile' : window.innerWidth < 1024 ? 'tablet' : 'desktop',
+      ts: now.toISOString(),
+      date: now.toISOString().split('T')[0]
+    };
+  }
+
+  // Append the visit to the localStorage queue and return the updated queue ([] if storage or parsing fails)
+  function storeVisit(visit) {
+    try {
+      const visits = JSON.parse(localStorage.getItem(STORAGE_KEY) || '[]');
+      visits.push(visit);
+
+      // Keep only the newest 200 visits; older entries are dropped from the front
+      if (visits.length > 200) {
+        visits.splice(0, visits.length - 200);
+      }
+
+      localStorage.setItem(STORAGE_KEY, JSON.stringify(visits));
+      return visits;
+    } catch (e) {
+      console.debug('Storage failed:', e);
+      return [];
+    }
+  }
+
+  // Read the queued visits; returns [] if storage access or JSON parsing throws
+  function getVisits() {
+    try {
+      return JSON.parse(localStorage.getItem(STORAGE_KEY) || '[]');
+    } catch (e) {
+      return [];
+    }
+  }
+
+  // Remove the whole queue from localStorage; storage errors are deliberately ignored
+  function clearVisits() {
+    try {
+      localStorage.removeItem(STORAGE_KEY);
+    } catch (e) {}
+  }
+
+  // Submit visits by creating a GitHub Issue; resolves to true only on HTTP 201. NOTE(review): no credentials are sent - confirm GitHub accepts anonymous issue creation
+  async function submitViaIssue(visits) {
+    if (!visits || visits.length === 0) return false;
+
+    try {
+      // Format as JSONL (one JSON object per line)
+      const jsonl = visits.map(v => JSON.stringify(v)).join('\n');
+
+      // Create issue body: visit count, submission time, then the JSONL inside a fenced jsonl block
+      const body = `
+
+**Visits**: ${visits.length}
+**Submitted**: ${new Date().toISOString()}
+
+\`\`\`jsonl
+${jsonl}
+\`\`\`
+
+`;
+
+      const title = `[Visit Data] ${visits.length} visits - ${new Date().toISOString().split('T')[0]}`;
+
+      // Create issue using GitHub API; no Authorization header is set (TODO confirm the endpoint allows this - GitHub documents it as authenticated)
+      const response = await fetch(`https://api.github.com/repos/${CONFIG.repo}/issues`, {
+        method: 'POST',
+        headers: {
+          'Accept': 'application/vnd.github.v3+json',
+          'Content-Type': 'application/json',
+        },
+        body: JSON.stringify({
+          title: title,
+          body: body,
+          labels: [CONFIG.issueLabel, 'automated']
+        })
+      });
+
+      if (response.status === 201) {
+        console.debug(`Submitted ${visits.length} visits via issue`);
+        clearVisits();
+        localStorage.setItem(LAST_SUBMIT_KEY, Date.now().toString());
+        return true;
+      } else {
+        const error = await response.text();
+        console.debug('Issue creation failed:', response.status, error);
+        return false;
+      }
+    } catch (e) {
+      console.debug('Submit error:', e);
+      return false;
+    }
+  }
+
+  // Decide whether the queued visits should be submitted now
+  function shouldSubmit(visits) {
+    // Submit if batch size reached
+    if (visits.length >= CONFIG.batchSize) {
+      return true;
+    }
+
+    // Submit if interval passed and have data (a missing timestamp parses as 0, so the interval counts as elapsed)
+    try {
+      const lastSubmit = parseInt(localStorage.getItem(LAST_SUBMIT_KEY) || '0');
+      if (Date.now() - lastSubmit > CONFIG.submitInterval) {
+        return visits.length > 0;
+      }
+    } catch (e) {}
+
+    return false;
+  }
+
+  // Record the current page view (when tracking is allowed) and submit the queue if shouldSubmit() agrees
+  function trackVisit() {
+    if (!shouldTrack()) return;
+
+    const visit = collectVisit();
+    const visits = storeVisit(visit);
+
+    // Auto-submit if conditions met (the returned promise is not awaited)
+    if (shouldSubmit(visits)) {
+      submitViaIssue(visits);
+    }
+  }
+
+  // Initialize: track the first page view and install the navigation, visibility and timer hooks
+  function init() {
+    // Track initial page view, immediately if the DOM is already parsed, otherwise on DOMContentLoaded
+    if (document.readyState === 'complete' || document.readyState === 'interactive') {
+      trackVisit();
+    } else {
+      document.addEventListener('DOMContentLoaded', trackVisit);
+    }
+
+    // Track SPA navigation: re-track when the pathname differs from the last one seen at a DOM mutation
+    let lastPath = location.pathname;
+    const observer = new MutationObserver(() => {
+      if (location.pathname !== lastPath) {
+        lastPath = location.pathname;
+        trackVisit();
+      }
+    });
+
+    if (document.body) {
+      observer.observe(document.body, { childList: true, subtree: false });
+    }
+
+    // Submit when the page becomes hidden
+    window.addEventListener('visibilitychange', () => {
+      if (document.visibilityState === 'hidden') {
+        const visits = getVisits();
+        if (visits.length >= 10) {  // Only submit if reasonable batch
+          submitViaIssue(visits);
+        }
+      }
+    });
+
+    // Periodic check: re-evaluate shouldSubmit() against the stored queue
+    setInterval(() => {
+      const visits = getVisits();
+      if (shouldSubmit(visits)) {
+        submitViaIssue(visits);
+      }
+    }, 60000);  // Every minute
+  }
+
+  // Public API exposed on window: read or clear the queue, force a submit, inspect the config
+  window.docsVisitTracker = {
+    getVisits,
+    clearVisits,
+    submit: () => submitViaIssue(getVisits()),
+    config: CONFIG
+  };
+
+  // Start
+  init();
+
+})();
+
diff --git a/mkdocs/mkdocs.yml b/mkdocs/mkdocs.yml
index c46c62c..00997e7 100644
--- a/mkdocs/mkdocs.yml
+++ b/mkdocs/mkdocs.yml
@@ -11,6 +11,9 @@ extra:
 copyright: "© Copyright AGNTCY Contributors."
extra_css: - stylesheets/custom.css +extra_javascript: + - javascripts/mermaid.js + - javascripts/visit-tracker-secure.js markdown_extensions: - admonition @@ -52,27 +55,27 @@ plugins: - "http://localhost*" - "https://localhost*" - "*127.0.0.1*" - + # Generic file patterns and placeholders - "*/screenshot.png" - - "*/image.png" + - "*/image.png" - "*/docs/path/to/file.md" - "https://api.NODE/*" - + # External URLs with false negatives (rate limiting/blocking) - "https://docs.agntcy.org/*" - "https://www.npmjs.com/" - "https://httpbin.org/" - + # Auto-generated anchors from API documentation - "#agntcy*" # Covers all agntcy protobuf types - "#google*" # Covers all Google protobuf types - "#uint32" - "#string" - - "#bytes" + - "#bytes" - "#bool" - "#top" - + # Cross-file API references (both source and build formats) - "dir-*-v1-api.md#*" # Source format - "../dir-*-v1-api/#*" # Build format From 09c63332dbfc5a260c318bd10ade78176bec91d3 Mon Sep 17 00:00:00 2001 From: Luca Muscariello Date: Mon, 13 Oct 2025 16:04:35 +0200 Subject: [PATCH 2/5] fix(docs): remove insecure process-visits.yml workflow Only keep process-visits-secure.yml which uses GitHub secrets for secure gist access. Signed-off-by: Luca Muscariello --- .github/workflows/process-visits.yml | 65 ---------------------------- 1 file changed, 65 deletions(-) delete mode 100644 .github/workflows/process-visits.yml diff --git a/.github/workflows/process-visits.yml b/.github/workflows/process-visits.yml deleted file mode 100644 index 2696bcd..0000000 --- a/.github/workflows/process-visits.yml +++ /dev/null @@ -1,65 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2025 Cisco and/or its affiliates. 
-# SPDX-License-Identifier: Apache-2.0 - -name: Process Website Visits - -on: - schedule: - - cron: '0 0 * * *' # Run daily at midnight UTC - workflow_dispatch: # Allow manual trigger - -permissions: - contents: write - -jobs: - process-visits: - name: Process and Report Visits - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@v4 - with: - fetch-depth: 0 - - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: '3.11' - - - name: Process visit data from Gist - env: - GIST_ID: ${{ secrets.VISIT_GIST_ID }} - GITHUB_TOKEN: ${{ secrets.VISIT_GIST_TOKEN }} - run: | - python3 .github/scripts/process_visits.py - - - name: Configure Git - run: | - git config --local user.email "github-actions[bot]@users.noreply.github.com" - git config --local user.name "github-actions[bot]" - - - name: Commit visit reports - run: | - git add .github/scripts/visit_report.md || true - git add .github/scripts/visit_stats.json || true - git add .github/scripts/visit_archive/ || true - - if git diff --cached --quiet; then - echo "No changes to commit" - else - git commit -m "docs: update visit statistics [skip ci]" - git push - fi - - - name: Upload artifacts - uses: actions/upload-artifact@v4 - with: - name: visit-reports - path: | - .github/scripts/visit_report.md - .github/scripts/visit_stats.json - .github/scripts/visit_archive/ - retention-days: 90 - - From c49f20c5442f8f0f8a5c283ef4bffd4c7b3a15e6 Mon Sep 17 00:00:00 2001 From: Luca Muscariello Date: Mon, 13 Oct 2025 16:53:31 +0200 Subject: [PATCH 3/5] refactor(docs): remove test tracking scripts Test scripts moved to local-only usage: - Removed test-tracking-simple.sh - Removed test-tracking-flow.sh - Removed test-tracking.js - Removed test:tracking tasks from Taskfile - Reverted .gitignore changes These scripts remain available locally for manual testing. 
Signed-off-by: Luca Muscariello --- .github/scripts/test-tracking-flow.sh | 160 --------------- .github/scripts/test-tracking-simple.sh | 112 ----------- .github/scripts/test-tracking.js | 249 ------------------------ 3 files changed, 521 deletions(-) delete mode 100755 .github/scripts/test-tracking-flow.sh delete mode 100755 .github/scripts/test-tracking-simple.sh delete mode 100644 .github/scripts/test-tracking.js diff --git a/.github/scripts/test-tracking-flow.sh b/.github/scripts/test-tracking-flow.sh deleted file mode 100755 index e05d00b..0000000 --- a/.github/scripts/test-tracking-flow.sh +++ /dev/null @@ -1,160 +0,0 @@ -#!/bin/bash - -# Interactive test to simulate the full tracking flow -# This script simulates what would happen when users visit pages - -set -e - -echo "🧪 Simulating Visit Tracking Flow" -echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -echo "" - -# Configuration matching the tracker -REPO="agntcy/docs" -BATCH_SIZE=50 -TEST_VISITS=5 - -# Simulate visit collection -echo "📊 Step 1: Simulating ${TEST_VISITS} page visits..." -echo "" - -VISITS=() -PAGES=("/" "/dir/overview/" "/slim/overview/" "/identity/overview/" "/dir/getting-started/") - -for i in $(seq 1 $TEST_VISITS); do - PAGE="${PAGES[$((i-1))]}" - TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%S.000Z") - DATE=$(date -u +"%Y-%m-%d") - - VISIT=$(cat </dev/null || ( - echo "[" - for i in "${!VISITS[@]}"; do - if [ $i -eq $((${#VISITS[@]} - 1)) ]; then - echo " ${VISITS[$i]}" - else - echo " ${VISITS[$i]}," - fi - done - echo "]" -) -echo "" - -# Create JSONL format -echo "📦 Step 3: Creating JSONL format for submission..." -echo "" -JSONL="" -for VISIT in "${VISITS[@]}"; do - JSONL="${JSONL}${VISIT}\n" -done - -echo "JSONL format (${#VISITS[@]} lines):" -echo "─────────────────────────────────────" -printf "${JSONL}" | head -n 3 -echo "..." -echo "" - -# Show what would be submitted as GitHub Issue -echo "🐙 Step 4: GitHub Issue that would be created..." 
-echo "" - -ISSUE_TITLE="[Visit Data] ${#VISITS[@]} visits - $(date -u +"%Y-%m-%d")" -ISSUE_BODY=$(cat < - -**Visits**: ${#VISITS[@]} -**Submitted**: $(date -u +"%Y-%m-%dT%H:%M:%S.000Z") - -\`\`\`jsonl -$(printf "${JSONL}") -\`\`\` - - -EOF -) - -echo "Repository: ${REPO}" -echo "Title: ${ISSUE_TITLE}" -echo "Labels: visit-data, automated" -echo "" -echo "Body Preview:" -echo "─────────────────────────────────────" -echo "$ISSUE_BODY" | head -n 15 -echo "" - -# Show API call that would be made -echo "🔌 Step 5: API call that would be made..." -echo "" -echo "Endpoint: https://api.github.com/repos/${REPO}/issues" -echo "Method: POST" -echo "Headers:" -echo " Accept: application/vnd.github.v3+json" -echo " Content-Type: application/json" -echo "" - -# Test actual API endpoint (without creating issue) -echo "🔍 Step 6: Verifying API endpoint accessibility..." -if curl -s --max-time 5 "https://api.github.com/repos/${REPO}" > /dev/null 2>&1; then - echo "✅ GitHub API is accessible" - echo "✅ Repository ${REPO} is reachable" -else - echo "⚠️ Could not reach GitHub API (network issue?)" -fi -echo "" - -# Summary -echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -echo "📋 Summary of Tracking Flow" -echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" -echo "" -echo "1. ✅ Visit data collected from browser" -echo "2. ✅ Data stored in localStorage" -echo "3. ✅ JSONL format created" -echo "4. ✅ GitHub issue body formatted" -echo "5. ✅ API endpoint validated" -echo "" -echo "Trigger Conditions:" -echo " • Batch size reached: ${TEST_VISITS}/${BATCH_SIZE} visits" -echo " • Time interval: Every 10 minutes" -echo " • On page unload: If ≥10 visits stored" -echo "" -echo "⚠️ Important Notes:" -echo "" -echo " • Tracking is DISABLED on localhost (by design)" -echo " • No actual GitHub issue created in this test" -echo " • Real submissions happen on docs.agntcy.org only" -echo "" -echo "🧪 To manually test submission:" -echo "" -echo " 1. Open browser to http://127.0.0.1:8000" -echo " 2. 
Open DevTools Console" -echo " 3. Manually add visits to localStorage:" -echo "" -echo " localStorage.setItem('docs_visits', JSON.stringify([" -printf '%s\n' "${VISITS[@]}" | sed 's/^/ /' | head -n 2 -echo " ..." -echo " ]))" -echo "" -echo " 4. Test submission:" -echo " window.docsVisitTracker.submit()" -echo "" -echo " 5. Check result in GitHub:" -echo " https://github.com/${REPO}/issues?q=label:visit-data" -echo "" - diff --git a/.github/scripts/test-tracking-simple.sh b/.github/scripts/test-tracking-simple.sh deleted file mode 100755 index 67ed8ab..0000000 --- a/.github/scripts/test-tracking-simple.sh +++ /dev/null @@ -1,112 +0,0 @@ -#!/bin/bash - -# Simple CLI test for visit tracking -# Tests that the tracking script is loaded and validates its presence - -set -e - -BASE_URL="http://127.0.0.1:8000" -TRACKER_PATH="/javascripts/visit-tracker-secure.js" - -echo "🧪 Testing visit tracking setup..." -echo "" - -# Test 1: Check if server is running -echo "Test 1: Checking if docs server is running..." -if curl -s --max-time 5 "${BASE_URL}" > /dev/null 2>&1; then - echo "✅ Server is running at ${BASE_URL}" -else - echo "❌ Server is not responding at ${BASE_URL}" - echo " Please run: task run" - exit 1 -fi -echo "" - -# Test 2: Check if tracking script exists -echo "Test 2: Checking if tracking script is available..." -HTTP_CODE=$(curl -s -o /dev/null -w "%{http_code}" "${BASE_URL}${TRACKER_PATH}") -if [ "$HTTP_CODE" = "200" ]; then - echo "✅ Tracking script found at ${TRACKER_PATH}" -else - echo "❌ Tracking script not found (HTTP ${HTTP_CODE})" - exit 1 -fi -echo "" - -# Test 3: Check if script is included in pages -echo "Test 3: Checking if tracking script is included in pages..." -if curl -s "${BASE_URL}" | grep -q "visit-tracker-secure.js"; then - echo "✅ Tracking script is included in the HTML" -else - echo "❌ Tracking script not found in HTML" - exit 1 -fi -echo "" - -# Test 4: Validate script content -echo "Test 4: Validating script content..." 
-SCRIPT_CONTENT=$(curl -s "${BASE_URL}${TRACKER_PATH}") - -# Check for key components -if echo "$SCRIPT_CONTENT" | grep -q "docsVisitTracker"; then - echo " ✅ Found window.docsVisitTracker API" -else - echo " ❌ Missing window.docsVisitTracker API" - exit 1 -fi - -if echo "$SCRIPT_CONTENT" | grep -q "shouldTrack"; then - echo " ✅ Found shouldTrack function" -else - echo " ❌ Missing shouldTrack function" - exit 1 -fi - -if echo "$SCRIPT_CONTENT" | grep -q "submitViaIssue"; then - echo " ✅ Found submitViaIssue function" -else - echo " ❌ Missing submitViaIssue function" - exit 1 -fi - -if echo "$SCRIPT_CONTENT" | grep -q "agntcy/docs"; then - echo " ✅ Found correct repo configuration" -else - echo " ❌ Missing or incorrect repo configuration" - exit 1 -fi - -echo "" - -# Test 5: Check localhost protection -echo "Test 5: Verifying localhost protection..." -if echo "$SCRIPT_CONTENT" | grep -q "localhost.*127.0.0.1"; then - echo "✅ Localhost protection is enabled (won't track on local dev)" -else - echo "⚠️ Warning: Localhost protection might be disabled" -fi -echo "" - -# Summary -echo "═══════════════════════════════════════════════" -echo "🎉 Basic tracking setup validated successfully!" -echo "═══════════════════════════════════════════════" -echo "" -echo "📋 Tracking Configuration:" -curl -s "${BASE_URL}${TRACKER_PATH}" | grep -A 5 "const CONFIG = {" | head -n 6 -echo "" -echo "🔍 To test in browser:" -echo " 1. Open: ${BASE_URL}" -echo " 2. Open DevTools Console (F12)" -echo " 3. Type: window.docsVisitTracker" -echo " 4. Check storage: window.docsVisitTracker.getVisits()" -echo "" -echo "⚠️ Note: Tracking is disabled on localhost by design." -echo " Use browser console commands to test manually." 
-echo "" -echo "Available browser commands:" -echo " • window.docsVisitTracker.getVisits() - View stored visits" -echo " • window.docsVisitTracker.clearVisits() - Clear storage" -echo " • window.docsVisitTracker.submit() - Submit to GitHub" -echo " • window.docsVisitTracker.config - View configuration" - diff --git a/.github/scripts/test-tracking.js b/.github/scripts/test-tracking.js deleted file mode 100644 index 8d67f55..0000000 --- a/.github/scripts/test-tracking.js +++ /dev/null @@ -1,249 +0,0 @@ -#!/usr/bin/env node - -/** - * Test script for visit tracking - * Tests the tracking functionality without actually creating GitHub issues - */ - -const puppeteer = require('puppeteer'); - -const CONFIG = { - baseUrl: 'http://127.0.0.1:8000', - testPages: [ - '/', - '/dir/overview/', - '/slim/overview/', - '/identity/overview/', - ], -}; - -async function testTracking() { - console.log('🧪 Starting visit tracking tests...\n'); - - const browser = await puppeteer.launch({ - headless: 'new', - args: ['--no-sandbox', '--disable-setuid-sandbox'] - }); - - try { - const page = await browser.newPage(); - - // Enable console output from the page - page.on('console', msg => { - const type = msg.type(); - if (type === 'debug' || type === 'log') { - console.log(` 📝 Browser: ${msg.text()}`); - } - }); - - // Mock the tracking to work on localhost - await page.evaluateOnNewDocument(() => { - // Override shouldTrack to return true for testing - window.__TEST_MODE__ = true; - }); - - console.log('✅ Browser launched'); - console.log(`🌐 Testing against: ${CONFIG.baseUrl}\n`); - - // Test 1: Check if tracker loads - console.log('Test 1: Checking if tracker loads...'); - await page.goto(CONFIG.baseUrl, { waitUntil: 'networkidle0' }); - - const trackerLoaded = await page.evaluate(() => { - return typeof window.docsVisitTracker !== 'undefined'; - }); - - if (trackerLoaded) { - console.log('✅ Tracker loaded successfully\n'); - } else { - console.log('❌ Tracker not found\n'); - return; - } 
- - // Test 2: Check tracker config - console.log('Test 2: Checking tracker configuration...'); - const config = await page.evaluate(() => { - return window.docsVisitTracker.config; - }); - console.log(` 📋 Repo: ${config.repo}`); - console.log(` 📋 Batch size: ${config.batchSize}`); - console.log(` 📋 Submit interval: ${config.submitInterval / 60000} minutes`); - console.log('✅ Config looks good\n'); - - // Test 3: Simulate visits - console.log('Test 3: Simulating page visits...'); - - // Override localStorage to work and disable localhost check - await page.evaluate(() => { - // Patch shouldTrack to work on localhost for testing - const originalScript = document.querySelector('script[src*="visit-tracker"]'); - if (originalScript) { - // Force tracking on localhost - window.__forceTracking = true; - } - }); - - // Clear any existing visits - await page.evaluate(() => { - window.docsVisitTracker.clearVisits(); - }); - - // Visit multiple pages - for (const path of CONFIG.testPages) { - const url = `${CONFIG.baseUrl}${path}`; - console.log(` 🌐 Visiting: ${path}`); - - await page.goto(url, { waitUntil: 'networkidle0' }); - await page.waitForTimeout(500); // Give tracking time to register - - // Manually track since localhost check prevents auto-tracking - await page.evaluate(() => { - // Manually create a visit entry - const visit = { - path: location.pathname, - ref: document.referrer ? new URL(document.referrer).hostname : 'direct', - device: window.innerWidth < 768 ? 'mobile' : window.innerWidth < 1024 ? 
'tablet' : 'desktop', - ts: new Date().toISOString(), - date: new Date().toISOString().split('T')[0] - }; - - // Store it - const visits = JSON.parse(localStorage.getItem('docs_visits') || '[]'); - visits.push(visit); - localStorage.setItem('docs_visits', JSON.stringify(visits)); - }); - } - - // Check stored visits - const visits = await page.evaluate(() => { - return window.docsVisitTracker.getVisits(); - }); - - console.log(`✅ Tracked ${visits.length} visits\n`); - - // Test 4: Display tracked data - console.log('Test 4: Displaying tracked visit data...'); - visits.forEach((visit, idx) => { - console.log(` ${idx + 1}. ${visit.path} [${visit.device}] at ${visit.ts}`); - console.log(` Referrer: ${visit.ref}`); - }); - console.log(''); - - // Test 5: Test data format - console.log('Test 5: Validating data format...'); - let validationPassed = true; - - for (const visit of visits) { - if (!visit.path || !visit.device || !visit.ts || !visit.date) { - console.log(`❌ Invalid visit data: ${JSON.stringify(visit)}`); - validationPassed = false; - } - } - - if (validationPassed) { - console.log('✅ All visit data is valid\n'); - } - - // Test 6: Test localStorage persistence - console.log('Test 6: Testing localStorage persistence...'); - const beforeRefresh = visits.length; - await page.reload({ waitUntil: 'networkidle0' }); - - const afterRefresh = await page.evaluate(() => { - return window.docsVisitTracker.getVisits().length; - }); - - if (beforeRefresh === afterRefresh) { - console.log(`✅ Data persisted across reload (${afterRefresh} visits)\n`); - } else { - console.log(`❌ Data not persisted (had ${beforeRefresh}, now ${afterRefresh})\n`); - } - - // Test 7: Test submission format (without actually submitting) - console.log('Test 7: Testing submission format...'); - const submissionData = await page.evaluate(() => { - const visits = window.docsVisitTracker.getVisits(); - const jsonl = visits.map(v => JSON.stringify(v)).join('\n'); - const body = ` - -**Visits**: 
${visits.length} -**Submitted**: ${new Date().toISOString()} - -\`\`\`jsonl -${jsonl} -\`\`\` - -`; - - return { - title: `[Visit Data] ${visits.length} visits - ${new Date().toISOString().split('T')[0]}`, - body: body, - linesCount: jsonl.split('\n').length - }; - }); - - console.log(` 📋 Issue title: ${submissionData.title}`); - console.log(` 📋 JSONL lines: ${submissionData.linesCount}`); - console.log('✅ Submission format is correct\n'); - - // Test 8: Test clear function - console.log('Test 8: Testing clear function...'); - await page.evaluate(() => { - window.docsVisitTracker.clearVisits(); - }); - - const afterClear = await page.evaluate(() => { - return window.docsVisitTracker.getVisits().length; - }); - - if (afterClear === 0) { - console.log('✅ Clear function works\n'); - } else { - console.log(`❌ Clear function failed (still has ${afterClear} visits)\n`); - } - - // Summary - console.log('═══════════════════════════════════════'); - console.log('🎉 All tests completed successfully!'); - console.log('═══════════════════════════════════════'); - console.log('\nTo test manual submission:'); - console.log('1. Open browser to http://127.0.0.1:8000'); - console.log('2. Open DevTools Console'); - console.log('3. Run: window.docsVisitTracker.getVisits()'); - console.log('4. 
Run: window.docsVisitTracker.submit()'); - console.log(' (This will create a real GitHub issue!)'); - - } catch (error) { - console.error('❌ Test failed:', error.message); - console.error(error.stack); - } finally { - await browser.close(); - } -} - -// Check if puppeteer is installed -async function checkDependencies() { - try { - require.resolve('puppeteer'); - return true; - } catch (e) { - return false; - } -} - -// Main -(async () => { - const hasDepends = await checkDependencies(); - - if (!hasDepends) { - console.log('❌ puppeteer is not installed'); - console.log('\nPlease install it first:'); - console.log(' npm install -D puppeteer'); - console.log('\nOr use npx:'); - console.log(' npx puppeteer browsers install chrome'); - process.exit(1); - } - - await testTracking(); -})(); - From c7d239e5b1f890ccb7450c6e3683ca54764c54f2 Mon Sep 17 00:00:00 2001 From: Luca Muscariello Date: Mon, 13 Oct 2025 17:30:21 +0200 Subject: [PATCH 4/5] security(docs): add comprehensive input validation for visit tracking Add secure validation layer to prevent malicious data submission: - New validation script (validate_visit_data.py): * Size limits: 1MB max issue, 100 visits per issue * Field whitelisting and type validation * Path traversal prevention (no .. 
or ~) * Safe character sets for all fields * ISO timestamp validation * Domain validation for referrers - Updated workflow (process-visits-secure.yml): * Validates all data before processing * Auto-closes invalid issues with explanation * Only processes data after validation success * Proper error handling and logging Security protections: - Prevents code injection attacks - Blocks path traversal attempts - Mitigates XSS via character whitelisting - DoS protection via size limits - No shell command execution of user data Signed-off-by: Luca Muscariello --- .github/scripts/validate_visit_data.py | 267 ++++++++++++++++++++ .github/workflows/process-visits-secure.yml | 54 +++- 2 files changed, 311 insertions(+), 10 deletions(-) create mode 100644 .github/scripts/validate_visit_data.py diff --git a/.github/scripts/validate_visit_data.py b/.github/scripts/validate_visit_data.py new file mode 100644 index 0000000..61d697b --- /dev/null +++ b/.github/scripts/validate_visit_data.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python3 +""" +Secure validation and extraction of visit data from GitHub issue body. + +This script implements security measures: +1. Input validation and sanitization +2. JSON schema validation +3. Size limits +4. Field whitelisting +5. 
Path traversal prevention
+"""
+
+import sys
+import json
+import re
+from datetime import datetime
+from pathlib import Path
+
+# Security Configuration
+MAX_ISSUE_SIZE = 1_000_000  # Upper bound on len(issue_body), i.e. characters of the decoded text
+MAX_VISITS_PER_ISSUE = 100  # Max 100 JSONL lines per issue (blank lines count towards the limit)
+MAX_PATH_LENGTH = 500
+MAX_REFERRER_LENGTH = 200
+MAX_TIMESTAMP_LENGTH = 30
+ALLOWED_DEVICES = {'mobile', 'tablet', 'desktop'}
+
+# Exact set of fields a visit record must carry, with the required JSON type of each
+VISIT_SCHEMA = {
+    'path': str,
+    'ref': str,
+    'device': str,
+    'ts': str,
+    'date': str
+}
+
+
+def validate_path(path: str) -> bool:
+    """Return True only for an absolute URL path of safe ASCII characters with no '..' or '~'."""
+    if not path or not isinstance(path, str):
+        return False
+
+    if len(path) > MAX_PATH_LENGTH:
+        return False
+
+    # Must start with /
+    if not path.startswith('/'):
+        return False
+
+    # Check for path traversal attempts
+    if '..' in path or '~' in path:
+        return False
+
+    # Only allow safe characters; fullmatch because '$' would also match before a trailing newline
+    if not re.fullmatch(r'/[a-zA-Z0-9/_\-\.]*', path):
+        return False
+
+    return True
+
+
+def validate_referrer(ref: str) -> bool:
+    """Return True if ref is the literal 'direct' or a plain dotted hostname within the length limit."""
+    if not ref or not isinstance(ref, str):
+        return False
+
+    if len(ref) > MAX_REFERRER_LENGTH:
+        return False
+
+    # Allow 'direct' or domain names
+    if ref == 'direct':
+        return True
+
+    # Simple domain validation (fullmatch so a trailing newline cannot slip through)
+    if not re.fullmatch(r'[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,}', ref):
+        return False
+
+    return True
+
+
+def validate_device(device: str) -> bool:
+    """Return True if device, compared case-insensitively, is one of ALLOWED_DEVICES."""
+    if not isinstance(device, str):
+        return False
+
+    return device.lower() in ALLOWED_DEVICES
+
+
+def validate_timestamp(ts: str) -> bool:
+    """Return True if ts is within the length limit and parses with datetime.fromisoformat."""
+    if not ts or not isinstance(ts, str):
+        return False
+
+    if len(ts) > MAX_TIMESTAMP_LENGTH:
+        return False
+
+    try:
+        # Must be valid ISO format ('Z' is rewritten as an explicit UTC offset before parsing)
+        datetime.fromisoformat(ts.replace('Z', '+00:00'))
+        return True
+    except (ValueError, AttributeError):
+        return False
+
+
+def validate_date(date: str) -> bool:
+    """Return True if date is a real calendar date written with ASCII digits as YYYY-MM-DD."""
+    if not date or not isinstance(date, str):
+        return False
+
+    if not re.fullmatch(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', date):
+        return False
+
+    try:
+        datetime.strptime(date, '%Y-%m-%d')
+        return True
+    except ValueError:
+        return False
+
+
+def validate_visit_record(visit: dict) -> tuple[bool, str]:
+    """
+    Validate one parsed JSONL value against VISIT_SCHEMA and the per-field validators.
+
+    Returns:
+        (is_valid, error_message); error_message is "" when the record is valid
+    """
+    # Must be a JSON object (json.loads can also yield lists, numbers, null) with every schema field
+    for field in VISIT_SCHEMA:
+        if not isinstance(visit, dict) or field not in visit:
+            return False, f"Missing required field: {field}"
+
+    # No extra fields allowed
+    for field in visit:
+        if field not in VISIT_SCHEMA:
+            return False, f"Unexpected field: {field}"
+
+    # Validate field types
+    for field, expected_type in VISIT_SCHEMA.items():
+        if not isinstance(visit[field], expected_type):
+            return False, f"Invalid type for {field}: expected {expected_type.__name__}"
+
+    # Validate path
+    if not validate_path(visit['path']):
+        return False, f"Invalid path: {visit['path']}"
+
+    # Validate referrer
+    if not validate_referrer(visit['ref']):
+        return False, f"Invalid referrer: {visit['ref']}"
+
+    # Validate device
+    if not validate_device(visit['device']):
+        return False, f"Invalid device: {visit['device']}"
+
+    # Validate timestamp
+    if not validate_timestamp(visit['ts']):
+        return False, f"Invalid timestamp: {visit['ts']}"
+
+    # Validate date
+    if not validate_date(visit['date']):
+        return False, f"Invalid date: {visit['date']}"
+
+    return True, ""
+
+
+def extract_jsonl_block(issue_body: str) -> str:
+    """
+    Extract the first fenced jsonl code block from the issue body.
+
+    Args:
+        issue_body: The full issue body text
+
+    Returns:
+        Stripped block content, or "" if the body is empty, oversized or has no block
+    """
+    if not issue_body or not isinstance(issue_body, str):
+        return ""
+
+    # Size check
+    if len(issue_body) > MAX_ISSUE_SIZE:
+        print(f"ERROR: Issue body too large: {len(issue_body)} bytes (max: {MAX_ISSUE_SIZE})",
+              file=sys.stderr)
+        return ""
+
+    # Find JSONL code block (non-greedy, so only the first fenced block is captured)
+    pattern = r'```jsonl\s*\n(.*?)\n```'
+    match = re.search(pattern, issue_body, re.DOTALL)
+
+    if not match:
+        print("ERROR: No JSONL code block found", file=sys.stderr)
+        return ""
+
+    return match.group(1).strip()
+
+
+def parse_and_validate_visits(jsonl_content: str) -> list[dict]:
+    """
+    Parse JSONL visit data, keeping valid records and reporting the rest on stderr.
+
+    Args:
+        jsonl_content: JSONL formatted visit data
+
+    Returns:
+        List of validated visit records ([] if empty or over the line limit)
+    """
+    if not jsonl_content:
+        return []
+
+    visits = []
+    lines = jsonl_content.strip().split('\n')
+
+    # Check count limit before parsing anything, so an oversized batch is rejected as a whole
+    if len(lines) > MAX_VISITS_PER_ISSUE:
+        print(f"ERROR: Too many visits: {len(lines)} (max: {MAX_VISITS_PER_ISSUE})",
+              file=sys.stderr)
+        return []
+
+    for line_num, line in enumerate(lines, 1):
+        line = line.strip()
+        if not line:
+            continue
+
+        # Parse JSON; a malformed line is reported and skipped
+        try:
+            visit = json.loads(line)
+        except json.JSONDecodeError as e:
+            print(f"ERROR: Line {line_num}: Invalid JSON: {e}", file=sys.stderr)
+            continue
+
+        # Validate record; an invalid record is reported and skipped
+        is_valid, error = validate_visit_record(visit)
+        if not is_valid:
+            print(f"ERROR: Line {line_num}: {error}", file=sys.stderr)
+            continue
+
+        visits.append(visit)
+
+    return visits
+
+
+def main():
+    """Main execution."""
+    # Read issue body from stdin
+    issue_body = sys.stdin.read()
+
+    # Extract JSONL block
+    jsonl_content = extract_jsonl_block(issue_body)
+
+    if not jsonl_content:
+        print("ERROR: No valid JSONL content found", file=sys.stderr)
+        sys.exit(1)
+
+    # Parse and validate
+    visits = parse_and_validate_visits(jsonl_content)
+
+    if not visits:
+        print("ERROR: No valid visits found", file=sys.stderr)
+
sys.exit(1) + + # Output validated JSONL to stdout + for visit in visits: + print(json.dumps(visit, separators=(',', ':'))) + + # Log success to stderr + print(f"✓ Validated {len(visits)} visits", file=sys.stderr) + + +if __name__ == '__main__': + main() + diff --git a/.github/workflows/process-visits-secure.yml b/.github/workflows/process-visits-secure.yml index 248c24c..737320c 100644 --- a/.github/workflows/process-visits-secure.yml +++ b/.github/workflows/process-visits-secure.yml @@ -29,22 +29,52 @@ jobs: - name: Checkout repository uses: actions/checkout@v4 - - name: Extract visit data from issue + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Validate and extract visit data from issue id: extract + continue-on-error: true env: ISSUE_BODY: ${{ github.event.issue.body }} ISSUE_NUMBER: ${{ github.event.issue.number }} run: | - # Extract JSONL data between ```jsonl markers - echo "$ISSUE_BODY" | sed -n '/```jsonl/,/```/p' | sed '/```/d' > /tmp/visit_data.jsonl + # Use secure validation script + if echo "$ISSUE_BODY" | python3 .github/scripts/validate_visit_data.py > /tmp/visit_data.jsonl 2> /tmp/validation_error.log; then + # Count valid lines + LINES=$(wc -l < /tmp/visit_data.jsonl | tr -d ' ') + echo "Validated and extracted $LINES visit records" + echo "lines=$LINES" >> $GITHUB_OUTPUT + echo "validation_success=true" >> $GITHUB_OUTPUT + else + echo "validation_success=false" >> $GITHUB_OUTPUT + echo "ERROR: Validation failed:" + cat /tmp/validation_error.log + exit 1 + fi + + - name: Close invalid issue + if: steps.extract.outputs.validation_success != 'true' + env: + ISSUE_NUMBER: ${{ github.event.issue.number }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: | + gh issue close $ISSUE_NUMBER \ + --comment "⚠️ This issue was automatically closed because the visit data failed validation. 
This is likely due to: + + - Invalid JSON format + - Missing required fields + - Invalid data types or values + - Security policy violations - # Count lines - LINES=$(wc -l < /tmp/visit_data.jsonl) - echo "Extracted $LINES visit records" - echo "lines=$LINES" >> $GITHUB_OUTPUT + If you believe this is an error, please contact the maintainers." \ + --repo ${{ github.repository }} + exit 1 - name: Append to Gist - if: steps.extract.outputs.lines > 0 + if: steps.extract.outputs.validation_success == 'true' && steps.extract.outputs.lines > 0 env: GIST_ID: ${{ secrets.VISIT_GIST_ID }} GITHUB_TOKEN: ${{ secrets.VISIT_GIST_TOKEN }} @@ -71,13 +101,17 @@ jobs: echo "✓ Appended ${LINES} visits to Gist" - - name: Close issue + - name: Close issue with success message + if: steps.extract.outputs.validation_success == 'true' env: ISSUE_NUMBER: ${{ github.event.issue.number }} GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | gh issue close $ISSUE_NUMBER \ - --comment "✓ Visit data processed and stored. Thank you!" \ + --comment "✓ Visit data processed and stored securely. Thank you! + + - Visits validated: ${{ steps.extract.outputs.lines }} + - All security checks passed" \ --repo ${{ github.repository }} # Job 2: Generate daily report From 1f6b51235649a16aecf377079d01c14797c99453 Mon Sep 17 00:00:00 2001 From: Luca Muscariello Date: Mon, 13 Oct 2025 17:42:53 +0200 Subject: [PATCH 5/5] refactor(docs): complete removal of test tracking from Taskfile Remove test:tracking tasks from Taskfile.yml: - test:tracking - test:tracking:setup - test:tracking:flow Revert .gitignore changes: - Remove node_modules entry - Remove package-lock.json entry These were part of the test scripts that have been moved to local-only usage. 
Signed-off-by: Luca Muscariello --- .gitignore | 4 ---- Taskfile.yml | 18 ------------------ 2 files changed, 22 deletions(-) diff --git a/.gitignore b/.gitignore index 3df1d91..cfdf98a 100644 --- a/.gitignore +++ b/.gitignore @@ -17,7 +17,3 @@ generated/ # Python cache __pycache__/ *.pyc - -# Node modules for tracking tests -node_modules/ -package-lock.json diff --git a/Taskfile.yml b/Taskfile.yml index 506858f..255caed 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -51,24 +51,6 @@ tasks: - task: lint - echo "All documentation tests passed!" - test:tracking: - desc: Test visit tracking setup and flow - cmds: - - task: test:tracking:setup - - task: test:tracking:flow - - test:tracking:setup: - desc: Test tracking script is properly loaded and configured - internal: true - cmds: - - bash .github/scripts/test-tracking-simple.sh - - test:tracking:flow: - desc: Simulate the full visit tracking flow - internal: true - cmds: - - bash .github/scripts/test-tracking-flow.sh - lint: desc: Run all linting checks (spelling, markdown) deps: