.github/scripts/process_visits.py (new file, 284 additions)
#!/usr/bin/env python3
"""
Process website visits from GitHub Gist and generate daily report

This script:
1. Fetches visit data from GitHub Gist
2. Aggregates data by day, page, referrer
3. Generates a daily report
4. Archives processed data
"""

import os
import sys
import json
import urllib.request
from datetime import datetime, timezone
from collections import defaultdict, Counter
from pathlib import Path

# Configuration
GIST_ID = os.environ.get('GIST_ID', 'YOUR_GIST_ID_HERE')
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '')
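# Both values are expected to come from the environment (e.g. GitHub Actions
# secrets); the GIST_ID fallback above is only a placeholder.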

SCRIPT_DIR = Path(__file__).parent
REPORT_FILE = SCRIPT_DIR / 'visit_report.md'
STATS_FILE = SCRIPT_DIR / 'visit_stats.json'
ARCHIVE_DIR = SCRIPT_DIR / 'visit_archive'


def fetch_gist_data():
"""Fetch visit data from GitHub Gist."""
import urllib.request

url = f'https://api.github.com/gists/{GIST_ID}'
headers = {
'Accept': 'application/vnd.github.v3+json'
}

if GITHUB_TOKEN:
headers['Authorization'] = f'Bearer {GITHUB_TOKEN}'

req = urllib.request.Request(url, headers=headers)

try:
with urllib.request.urlopen(req) as response:
gist = json.loads(response.read().decode())

            # A gist can hold several files; this script reads the first one
            filename = next(iter(gist['files']))
content = gist['files'][filename]['content']

return content, filename
except Exception as e:
print(f"Error fetching gist: {e}", file=sys.stderr)
return None, None


def parse_visits(content):
"""Parse JSONL content into visit records."""
visits = []

if not content:
return visits

for line in content.strip().split('\n'):
if not line:
continue
try:
visit = json.loads(line)
visits.append(visit)
except json.JSONDecodeError as e:
print(f"Warning: Skipping invalid line: {e}", file=sys.stderr)

return visits
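
# Example (illustrative): parse_visits('{"date": "2024-05-01", "path": "/"}\n')
# returns [{'date': '2024-05-01', 'path': '/'}]; malformed lines are skipped
# with a warning rather than aborting the run.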


def aggregate_visits(visits):
"""Aggregate visits by various dimensions."""
stats = {
'total_visits': len(visits),
'by_date': defaultdict(int),
'by_page': Counter(),
'by_referrer': Counter(),
'by_device': Counter(),
'unique_dates': set(),
'date_range': {'start': None, 'end': None}
}

for visit in visits:
date = visit.get('date', '')
path = visit.get('path', '/')
ref = visit.get('ref', 'direct')
device = visit.get('device', 'desktop')

if date:
stats['by_date'][date] += 1
stats['unique_dates'].add(date)

stats['by_page'][path] += 1
stats['by_referrer'][ref] += 1
stats['by_device'][device] += 1

# Calculate date range
if stats['unique_dates']:
sorted_dates = sorted(stats['unique_dates'])
stats['date_range']['start'] = sorted_dates[0]
stats['date_range']['end'] = sorted_dates[-1]

    # Make everything JSON-serializable: the set becomes a count, and the
    # Counters/defaultdict become plain dicts (pages/referrers truncated to top N)
stats['unique_dates'] = len(stats['unique_dates'])
stats['by_date'] = dict(stats['by_date'])
stats['by_page'] = dict(stats['by_page'].most_common(20))
stats['by_referrer'] = dict(stats['by_referrer'].most_common(10))
stats['by_device'] = dict(stats['by_device'])

return stats
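
# An illustrative (not authoritative) sketch of the structure returned above:
#   {'total_visits': 2, 'by_date': {'2024-05-01': 2}, 'by_page': {'/': 2},
#    'by_referrer': {'direct': 2}, 'by_device': {'desktop': 2},
#    'unique_dates': 1, 'date_range': {'start': '2024-05-01', 'end': '2024-05-01'}}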


def generate_report(stats):
"""Generate a markdown report."""
now = datetime.now(timezone.utc)

report = f"""# Website Visit Report - docs.agntcy.org

**Generated**: {now.strftime('%Y-%m-%d %H:%M:%S UTC')}

## Summary

- **Total Visits**: {stats['total_visits']:,}
- **Unique Days**: {stats['unique_dates']}
- **Date Range**: {stats['date_range']['start']} to {stats['date_range']['end']}

## Top Pages

| Page | Visits |
|------|-------:|
"""

for page, count in list(stats['by_page'].items())[:15]:
report += f"| `{page}` | {count:,} |\n"

report += "\n## Top Referrers\n\n| Referrer | Visits |\n|----------|-------:|\n"

for ref, count in list(stats['by_referrer'].items())[:10]:
report += f"| {ref} | {count:,} |\n"

report += "\n## Device Distribution\n\n| Device | Visits | Percentage |\n|--------|-------:|-----------:|\n"

total = stats['total_visits']
for device, count in stats['by_device'].items():
pct = (count / total * 100) if total > 0 else 0
report += f"| {device.capitalize()} | {count:,} | {pct:.1f}% |\n"

report += "\n## Daily Visits (Last 30 Days)\n\n| Date | Visits |\n|------|-------:|\n"

sorted_dates = sorted(stats['by_date'].keys(), reverse=True)[:30]
for date in sorted_dates:
count = stats['by_date'][date]
report += f"| {date} | {count:,} |\n"

report += "\n---\n*Data collected from docs.agntcy.org visits*\n"

return report
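
# The report intentionally truncates its tables: 15 pages (of the 20 kept by
# aggregate_visits), 10 referrers, and the 30 most recent days.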


def save_report(report):
"""Save report to file."""
with open(REPORT_FILE, 'w', encoding='utf-8') as f:
f.write(report)
print(f"✓ Report saved to {REPORT_FILE}")


def save_stats(stats):
"""Save statistics as JSON."""
stats['last_updated'] = datetime.now(timezone.utc).isoformat()

with open(STATS_FILE, 'w', encoding='utf-8') as f:
json.dump(stats, f, indent=2)
print(f"✓ Statistics saved to {STATS_FILE}")


def archive_data(content, filename):
"""Archive processed data."""
if not content:
return

ARCHIVE_DIR.mkdir(exist_ok=True)

timestamp = datetime.now(timezone.utc).strftime('%Y%m%d_%H%M%S')
archive_file = ARCHIVE_DIR / f"{filename}_{timestamp}.jsonl"

with open(archive_file, 'w', encoding='utf-8') as f:
f.write(content)

print(f"✓ Data archived to {archive_file}")


def clear_gist():
"""Clear the gist after processing (optional)."""
import urllib.request

if not GITHUB_TOKEN:
print("No GitHub token, skipping gist clear")
return

url = f'https://api.github.com/gists/{GIST_ID}'

    # Re-fetch the gist to learn which filename to overwrite
content, filename = fetch_gist_data()
if not filename:
return

# Clear content
data = json.dumps({
'files': {
filename: {
'content': '# Processed - waiting for new data\n'
}
}
}).encode()

req = urllib.request.Request(
url,
data=data,
method='PATCH',
headers={
'Accept': 'application/vnd.github.v3+json',
'Authorization': f'Bearer {GITHUB_TOKEN}',
'Content-Type': 'application/json'
}
)

try:
        with urllib.request.urlopen(req):
print("✓ Gist cleared")
except Exception as e:
print(f"Warning: Failed to clear gist: {e}", file=sys.stderr)


def main():
"""Main execution."""
print("Processing website visits from GitHub Gist...\n")

# Fetch data
content, filename = fetch_gist_data()

if not content:
print("No data to process")
return

print(f"Fetched {len(content)} bytes from Gist")

# Parse visits
visits = parse_visits(content)
print(f"Parsed {len(visits)} visits")

    if not visits:
print("No visits to process")
return

# Aggregate
stats = aggregate_visits(visits)

# Generate report
report = generate_report(stats)
save_report(report)

# Save stats
save_stats(stats)

    # Archive data (archive_data re-appends ".jsonl" plus a UTC timestamp)
    archive_data(content, filename.replace('.jsonl', ''))

    # Clear gist (optional - uncomment to wipe the gist after each run)
    # clear_gist()

print(f"\n✓ Processing complete!")
print(f" Total visits: {stats['total_visits']:,}")
print(f" Date range: {stats['date_range']['start']} to {stats['date_range']['end']}")


if __name__ == '__main__':
main()
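
# Local usage sketch (values are placeholders; assumes GIST_ID points at a
# JSONL gist and the token can read it):
#   GIST_ID=<your-gist-id> GITHUB_TOKEN=<token> python3 .github/scripts/process_visits.py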

