import argparse
import datetime
import logging
import os
import sys

import boto3
from botocore.client import Config as BotoConfig
from dotenv import load_dotenv

from access_tracker import AccessTracker

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
    access_tracker = AccessTracker()

    required_env_vars = [
        'NICONICOGAY_S3_ACCESS_KEY',
        'NICONICOGAY_S3_SECRET_KEY',
        'NICONICOGAY_S3_BUCKET_NAME',
        'NICONICOGAY_S3_REGION'
    ]
    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
    if missing_vars:
        logger.error(f"Missing required environment variables: {', '.join(missing_vars)}")
        sys.exit(1)

    try:
        s3_session = boto3.Session()
        s3_client = s3_session.client(
            's3',
            aws_access_key_id=os.environ['NICONICOGAY_S3_ACCESS_KEY'],
            aws_secret_access_key=os.environ['NICONICOGAY_S3_SECRET_KEY'],
            region_name=os.environ['NICONICOGAY_S3_REGION'],
            endpoint_url=f"https://{os.environ['NICONICOGAY_S3_REGION']}.digitaloceanspaces.com",
            config=BotoConfig(s3={'addressing_style': 'virtual'}),
        )
        bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
        cutoff_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp() - (days * 24 * 60 * 60)

        paginator = s3_client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")

        total_files = 0
        objects_to_delete = []
        access_times = access_tracker.get_all_access_times()

        for page in page_iterator:
            if 'Contents' not in page:
                continue
            for obj in page['Contents']:
                total_files += 1
                key = obj['Key']  # type: ignore
                # Extract video_id from S3 key (e.g., "niconico/sm12345.mp4" -> "sm12345")
                if key.startswith(f"{directory_prefix}/") and key.endswith('.mp4'):
                    video_id = key[len(f"{directory_prefix}/"):-4]  # Remove prefix and .mp4 extension
                    last_access = access_times.get(video_id)
                    should_delete = False
                    if last_access is None:
                        # No access record - the file hasn't been accessed since tracking
                        # started. For safety, only delete it if it is older than the cutoff.
                        if obj['LastModified'].timestamp() < cutoff_timestamp:  # type: ignore
                            should_delete = True
                            logger.debug(f"Will delete {video_id}: no access record and file is old")
                    elif last_access < cutoff_timestamp:
                        # Has an access record, but the last access was too long ago
                        should_delete = True
                        days_ago = (datetime.datetime.now(datetime.timezone.utc).timestamp() - last_access) / (24 * 60 * 60)
                        logger.debug(f"Will delete {video_id}: last accessed {days_ago:.1f} days ago")
                    if should_delete:
                        objects_to_delete.append({'Key': key})
                        # Remove the access record since we're deleting the file
                        if not dry_run:
                            access_tracker.remove_access_record(video_id)

        if not objects_to_delete:
            logger.info("No files to delete")
            return

        if dry_run:
            logger.info(f"DRY RUN: Would delete {len(objects_to_delete)} out of {total_files} files")
        else:
            # Delete in batches of 1000 keys (the maximum the S3 DeleteObjects API accepts per request)
            for i in range(0, len(objects_to_delete), 1000):
                batch = objects_to_delete[i:i + 1000]
                s3_client.delete_objects(
                    Bucket=bucket_name,
                    Delete={'Objects': batch}
                )
            logger.info(f"Successfully deleted {len(objects_to_delete)} out of {total_files} files")
    except Exception as e:
        logger.error(f"Error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    load_dotenv()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting anything"
    )
    args = parser.parse_args()
    cleanup_old_files(dry_run=args.dry_run)
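# Example usage, assuming this script is saved as cleanup_old_files.py (the
# filename is illustrative) and a .env file in the working directory provides
# the four NICONICOGAY_* variables checked above:
#
#   python cleanup_old_files.py --dry-run   # log what would be deleted, delete nothing
#   python cleanup_old_files.py             # delete files not accessed in the last 7 days
#
# Note that only --dry-run is exposed on the command line; the `days` and
# `directory_prefix` parameters keep their defaults unless cleanup_old_files()
# is called directly from other code.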