Back to Blog
2026-03-22

The AI Agent Content Pipeline: From Raw Footage to Posted — Fully Automated

The full content workflow — film, ingest, transcribe, extract clips, caption, thumbnail, and post — can run automatically with AI agents. Here's the complete pipeline architecture and code.

The bottleneck for most content creators isn't filming — it's everything that comes after. Editing, captioning, thumbnail creation, writing captions for each platform, posting to 5 accounts. It can take 3-4 hours of post-production per video.

An AI agent pipeline collapses this to about 20 minutes of active work: transfer the footage, kick off the pipeline, review the output, approve and post.

Here's the complete architecture.

Pipeline Overview

1. Ingest          Raw footage → working directory, metadata extracted
2. Transcribe      Whisper → transcript + timestamps + keywords
3. Scene Detect    Shot detection → segment boundaries
4. Clip Extract    Viral moment detection → top clips ranked by score
5. Audio Mix       Denoise + normalize + background music if needed
6. Caption Burn    SRT → styled captions burned into video
7. Thumbnail Gen   Keyframe selection + AI text overlay → thumbnail.jpg
8. Metadata Gen    GPT → platform-specific title, caption, hashtags
9. Export          Platform-optimized encodes (9:16, 16:9, 1:1)
10. Queue Post     Schedule or immediately post to all accounts

Each stage is a discrete function that reads from and writes to a shared job directory. Stages can run in parallel where dependencies allow.

Job Directory Structure

./jobs/job_20260322_001/
├── input/
│   └── raw_footage.mp4
├── meta.json              ← job config + status tracking
├── transcripts/
│   ├── transcript.json    ← full Whisper output with timestamps
│   └── transcript.srt
├── clips/
│   ├── clip_001.mp4       ← extracted highlight clips
│   ├── clip_002.mp4
│   └── scores.json        ← virality scores for each clip
├── audio/
│   └── clean_audio.wav
├── captions/
│   └── styled.srt
├── thumbnails/
│   └── thumbnail_001.jpg
├── export/
│   ├── reel_9x16.mp4      ← final exports
│   ├── youtube_16x9.mp4
│   └── square_1x1.mp4
└── post_metadata.json     ← captions, titles, hashtags per platform

Stage 1: Ingest and Metadata Extraction

import subprocess
import json
import shutil
from pathlib import Path
from datetime import datetime

def ingest_footage(source_path: str, jobs_dir: str = "./jobs") -> dict:
    """
    Copy footage to a new job directory and extract video metadata.

    Args:
        source_path: Path to the raw footage file.
        jobs_dir: Root directory under which job folders are created.

    Returns:
        Job config dict (also persisted to <job_dir>/meta.json).

    Raises:
        FileNotFoundError: If source_path does not exist.
        RuntimeError: If ffprobe fails or returns unparseable output.
    """
    source = Path(source_path)
    if not source.is_file():
        raise FileNotFoundError(f"Footage not found: {source_path}")

    # Create job directory, timestamped to the second.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    job_id = f"job_{timestamp}"
    job_dir = Path(jobs_dir) / job_id

    for subdir in ["input", "transcripts", "clips", "audio", "captions",
                   "thumbnails", "export"]:
        # exist_ok=True makes a re-run after a partial failure idempotent.
        (job_dir / subdir).mkdir(parents=True, exist_ok=True)

    # Copy input file (copy2 also preserves file metadata such as mtime).
    input_path = job_dir / "input" / source.name
    shutil.copy2(source, input_path)

    # Extract container metadata with ffprobe.
    probe = subprocess.run([
        "ffprobe", "-v", "quiet", "-print_format", "json",
        "-show_streams", "-show_format", str(input_path)
    ], capture_output=True, text=True)
    if probe.returncode != 0:
        raise RuntimeError(f"ffprobe failed for {input_path}: {probe.stderr.strip()}")

    try:
        probe_data = json.loads(probe.stdout)
    except json.JSONDecodeError as err:
        raise RuntimeError("ffprobe produced unparseable JSON output") from err
    fmt = probe_data.get("format", {})

    job = {
        "job_id": job_id,
        "job_dir": str(job_dir),
        "input_path": str(input_path),
        "duration": float(fmt.get("duration", 0)),
        "file_size_mb": int(fmt.get("size", 0)) / 1_000_000,  # decimal MB
        "created_at": datetime.now().isoformat(),
        "status": "ingested",
        "stages_complete": []
    }

    (job_dir / "meta.json").write_text(json.dumps(job, indent=2))
    print(f"✓ Ingested: {job_id} ({job['duration']:.0f}s, {job['file_size_mb']:.1f}MB)")

    return job

Stage 2: Transcription

from faster_whisper import WhisperModel

_whisper_model = None

def get_whisper():
    """Return the shared Whisper model, loading it lazily on first call."""
    global _whisper_model
    if _whisper_model is not None:
        return _whisper_model
    # Model initialization is expensive; cache one instance at module level.
    _whisper_model = WhisperModel("medium", device="cuda", compute_type="float16")
    return _whisper_model

def transcribe_footage(job: dict, language: str = "en") -> dict:
    """
    Transcribe the job's input footage with Whisper.

    Writes transcripts/transcript.json (structured output with word-level
    timestamps) and transcripts/transcript.srt, then records the stage.

    Args:
        job: Job config dict produced by ingest_footage.
        language: Language code passed to Whisper (default "en"); exposed
            as a parameter so non-English footage can be transcribed too.

    Returns:
        The same job dict, updated with transcript_path and stage status.
    """
    model = get_whisper()
    input_path = job["input_path"]
    transcript_dir = Path(job["job_dir"]) / "transcripts"

    segments, info = model.transcribe(
        input_path,
        language=language,
        word_timestamps=True,
        beam_size=5
    )

    # faster-whisper yields segments lazily; materialize once and reuse.
    all_segments = list(segments)
    full_text = " ".join(s.text for s in all_segments).strip()

    # Build structured transcript (segment + word timestamps for later stages).
    transcript_data = {
        "language": info.language,
        "duration": info.duration,
        "text": full_text,
        "segments": [
            {
                "start": s.start,
                "end": s.end,
                "text": s.text.strip(),
                # s.words can be None when word timestamps are unavailable.
                "words": [{"word": w.word, "start": w.start, "end": w.end}
                          for w in (s.words or [])]
            }
            for s in all_segments
        ]
    }

    (transcript_dir / "transcript.json").write_text(json.dumps(transcript_data, indent=2))

    # Write SRT: numbered cues separated by blank lines.
    srt_lines = []
    for i, seg in enumerate(all_segments, 1):
        srt_lines.append(f"{i}\n{_fmt_ts(seg.start)} --> {_fmt_ts(seg.end)}\n{seg.text.strip()}\n")
    (transcript_dir / "transcript.srt").write_text("\n".join(srt_lines))

    job["transcript_path"] = str(transcript_dir / "transcript.json")
    job["stages_complete"].append("transcribe")
    return job

def _fmt_ts(seconds: float) -> str:
    from datetime import timedelta
    td = timedelta(seconds=seconds)
    h, rem = divmod(int(td.total_seconds()), 3600)
    m, s = divmod(rem, 60)
    ms = int((td.total_seconds() % 1) * 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

Stage 3: Viral Clip Extraction

import openai
import re

# Shared OpenAI client used by the GPT-backed stages below.
client = openai.OpenAI()

def extract_viral_clips(job: dict, max_clips: int = 5) -> dict:
    """
    Use GPT to identify the best clips from the transcript, then cut them
    from the source footage with ffmpeg.

    Args:
        job: Job dict with transcript_path set (run transcribe_footage first).
        max_clips: Maximum number of clips to request from the model.

    Returns:
        The job dict updated with a "clips" list (timestamps, scores, paths).

    Raises:
        RuntimeError: If ffmpeg fails to extract a clip.
    """
    with open(job["transcript_path"]) as f:
        transcript = json.load(f)

    # Build transcript with timestamps for GPT
    timestamped = "\n".join(
        f"[{seg['start']:.1f}s-{seg['end']:.1f}s]: {seg['text']}"
        for seg in transcript["segments"]
    )

    prompt = f"""
    This is a video transcript with timestamps. Identify the {max_clips} best clips for social media.
    
    Transcript:
    {timestamped[:4000]}
    
    For each clip, provide:
    - start_time: float (seconds)
    - end_time: float (seconds, clip should be 15-60 seconds max)
    - hook: the opening line that makes people stop scrolling
    - virality_score: 1-10 (10 = extremely shareable)
    - reason: why this clip works
    
    Prioritize: strong hooks, emotional moments, surprising facts, how-to segments.
    Return as JSON array under key "clips".
    """

    resp = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.3
    )

    raw_clips = json.loads(resp.choices[0].message.content).get("clips", [])

    # Model output is untrusted: keep only clips with numeric, ordered
    # timestamps so ffmpeg is never asked for a zero/negative duration.
    clips_data = []
    for clip in raw_clips:
        try:
            start = float(clip["start_time"])
            end = float(clip["end_time"])
        except (KeyError, TypeError, ValueError):
            continue
        if end > start:
            clip["start_time"], clip["end_time"] = start, end
            clips_data.append(clip)

    clips_dir = Path(job["job_dir"]) / "clips"

    # Extract each clip with ffmpeg. -ss before -i seeks on the input
    # (fast); re-encoding gives frame-accurate cut points.
    for i, clip in enumerate(clips_data):
        output_path = clips_dir / f"clip_{i+1:03d}.mp4"
        duration = clip["end_time"] - clip["start_time"]

        result = subprocess.run([
            "ffmpeg", "-y",
            "-ss", str(clip["start_time"]),
            "-i", job["input_path"],
            "-t", str(duration),
            "-c:v", "libx264", "-crf", "18",
            "-c:a", "aac", "-b:a", "128k",
            str(output_path)
        ], capture_output=True)
        if result.returncode != 0:
            raise RuntimeError(
                f"ffmpeg failed on clip {i+1}: "
                f"{result.stderr.decode(errors='replace')[-500:]}"
            )

        clip["file_path"] = str(output_path)

    (clips_dir / "scores.json").write_text(json.dumps(clips_data, indent=2))
    job["clips"] = clips_data
    job["stages_complete"].append("clip_extract")
    return job

Stage 4: Platform Metadata Generation

def generate_post_metadata(job: dict, brand: str = "nepa_ai") -> dict:
    """Generate platform-specific captions, titles, and hashtags."""

    # Summarize from the stored transcript; cap the text fed to the model.
    transcript = json.loads(Path(job["transcript_path"]).read_text())
    full_text = transcript["text"][:1500]

    prompt = f"""
    Brand: {brand}
    Video transcript summary: {full_text}
    
    Generate post metadata for each platform. Return JSON with keys:
    - youtube: {{title, description (500 words), tags: list}}
    - instagram: {{caption (2200 chars max), hashtags: list of 30}}
    - tiktok: {{caption (150 chars max), hashtags: list of 10}}
    - twitter: {{tweet (280 chars), hashtags: list of 5}}
    
    Tone: professional but conversational. Include a CTA on each platform.
    For YouTube, include timestamps based on the transcript.
    """

    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}],
        response_format={"type": "json_object"},
        temperature=0.6
    )
    post_meta = json.loads(completion.choices[0].message.content)

    # Persist next to the job so the posting stage can pick it up.
    meta_path = Path(job["job_dir"]) / "post_metadata.json"
    meta_path.write_text(json.dumps(post_meta, indent=2))

    job["post_metadata_path"] = str(meta_path)
    job["stages_complete"].append("metadata")
    return job

Stage 5: Export and Queue

def export_for_platforms(job: dict, clip_index: int = 0) -> dict:
    """
    Export the chosen clip in platform-optimized aspect ratios.

    Args:
        job: Job dict whose clips/ directory already contains extracted clips.
        clip_index: Which clip (sorted by filename) to export.

    Returns:
        The job dict updated with an "exports" mapping of filename -> path.

    Raises:
        ValueError: If no clips exist yet.
        IndexError: If clip_index is out of range.
        RuntimeError: If an ffmpeg export fails.
    """
    clip_files = sorted(Path(job["job_dir"]).glob("clips/clip_*.mp4"))
    if not clip_files:
        raise ValueError("No clips found — run extract_viral_clips first")
    if not 0 <= clip_index < len(clip_files):
        raise IndexError(
            f"clip_index {clip_index} out of range (have {len(clip_files)} clips)"
        )

    source = str(clip_files[clip_index])
    export_dir = Path(job["job_dir"]) / "export"

    # scale + pad letterboxes/pillarboxes the clip into each target canvas
    # without distorting the original aspect ratio.
    exports = {
        "reel_9x16.mp4": ["-vf", "scale=1080:1920:force_original_aspect_ratio=decrease,pad=1080:1920:(ow-iw)/2:(oh-ih)/2"],
        "youtube_16x9.mp4": ["-vf", "scale=1920:1080:force_original_aspect_ratio=decrease,pad=1920:1080:(ow-iw)/2:(oh-ih)/2"],
        "square_1x1.mp4": ["-vf", "scale=1080:1080:force_original_aspect_ratio=decrease,pad=1080:1080:(ow-iw)/2:(oh-ih)/2"],
    }

    for filename, vf_args in exports.items():
        output = str(export_dir / filename)
        result = subprocess.run([
            "ffmpeg", "-y", "-i", source,
            *vf_args,
            "-c:v", "libx264", "-crf", "20", "-preset", "fast",
            "-c:a", "aac", "-b:a", "128k",
            output
        ], capture_output=True)
        if result.returncode != 0:
            raise RuntimeError(
                f"ffmpeg export failed for {filename}: "
                f"{result.stderr.decode(errors='replace')[-500:]}"
            )
        # Fixed: the original print had a mangled placeholder instead of
        # the exported filename.
        print(f"  ✓ Exported: {filename}")

    job["exports"] = {k: str(export_dir / k) for k in exports}
    job["stages_complete"].append("export")
    return job


Full Pipeline Runner

def run_full_pipeline(footage_path: str, brand: str = "nepa_ai"):
    """
    Execute the complete film-to-post pipeline and persist the final state.

    Args:
        footage_path: Path to the raw recording.
        brand: Brand handle forwarded to metadata generation.

    Returns:
        The final job dict with status "ready_to_post".
    """
    print(f"\n🎬 Starting content pipeline for: {footage_path}")

    job = ingest_footage(footage_path)
    job = transcribe_footage(job)
    job = extract_viral_clips(job, max_clips=3)
    job = generate_post_metadata(job, brand=brand)
    job = export_for_platforms(job, clip_index=0)

    # Persist final state so a reviewer (or posting agent) can pick it up.
    job["status"] = "ready_to_post"
    (Path(job["job_dir"]) / "meta.json").write_text(json.dumps(job, indent=2))

    print(f"\n✅ Pipeline complete: {job['job_id']}")
    print(f"   Clips extracted: {len(job['clips'])}")
    print(f"   Exports: {list(job['exports'].keys())}")
    print(f"   Review at: {job['job_dir']}")

    return job

# Kick it off — guarded so importing this module doesn't start a pipeline run.
if __name__ == "__main__":
    job = run_full_pipeline("./footage/todays_recording.mp4", brand="nepa_ai")

The full pipeline runs in about 8-12 minutes for a 30-minute recording on modern hardware. You review the outputs, make any edits, and approve for posting.


The NEPA AI Content Creator Stack packages the complete pipeline — ingest, transcribe, clip extraction, captioning, thumbnail generation, metadata writing, and multi-platform posting — into a single agent-driven workflow.

→ Get the Content Creator AI Stack at /shop/content-creator-ai-stack

Film it. The agent handles the rest.