Manual video editing is a pain. This post shows how to automate it with Python and a few open-source tools: YOLOv8 finds the action moments, Whisper transcribes speech with word-level timestamps, and ffmpeg does the cutting — all stitched together for fast edits.
The Core Idea
Instead of scrubbing, the pipeline:
- Samples the video at a fixed interval (every Nth frame) and runs YOLO on each sampled frame
- Scores each second based on detection confidence and object count
- Transcribes audio with Whisper, getting word-level timestamps
- Identifies highlight windows using a sliding score algorithm
- Cuts and exports segments with captions burned in
No manual tagging. No timeline scrubbing.
Setup
pip install ultralytics openai-whisper ffmpeg-python numpy pandas
# The ffmpeg binary itself must also be installed
sudo apt install ffmpeg # Ubuntu/Debian
Step 1: YOLO Frame Scoring
import cv2
import numpy as np
from ultralytics import YOLO
from pathlib import Path
def score_video_frames(
    video_path: str,
    model_size: str = "yolov8m.pt",  # n, s, m, l, x
    target_classes: list = None,  # None = all classes
    sample_rate: int = 5,  # Check every Nth frame
    confidence_threshold: float = 0.4
) -> dict:
    """Score sampled frames of a video by their YOLO detections.

    Every ``sample_rate``-th frame is passed to the model. Each detection
    that survives the class filter adds ``confidence * 10`` to that frame's
    score, so frames with more (and more confident) detections score higher.

    Args:
        video_path: Path of the video handed to ``cv2.VideoCapture``.
        model_size: Weights file name passed to ``YOLO(...)``.
        target_classes: Class names to count; a falsy value counts all classes.
        sample_rate: Run detection on every Nth frame (must be >= 1).
        confidence_threshold: Passed to the model as ``conf``.

    Returns:
        Dict mapping timestamp in seconds (``frame_num / fps``) to the score
        of that sampled frame. Empty if no frame could be read.

    Raises:
        ValueError: If ``sample_rate`` is below 1, or if the capture opened
            but reports a non-positive FPS (timestamps cannot be computed).
    """
    if sample_rate < 1:
        raise ValueError("sample_rate must be >= 1")

    # A set gives O(1) membership tests inside the per-box loop.
    wanted = set(target_classes) if target_classes else None

    cap = cv2.VideoCapture(video_path)
    frame_scores = {}
    try:
        if not cap.isOpened():
            # Keep the original best-effort behaviour: nothing readable,
            # nothing scored.
            return frame_scores

        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            raise ValueError(f"Video reports an invalid FPS ({fps}): {video_path}")

        model = YOLO(model_size)
        frame_num = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_num % sample_rate == 0:
                results = model(frame, verbose=False, conf=confidence_threshold)
                score = 0
                for r in results:
                    boxes = r.boxes
                    if boxes is None:
                        continue
                    for box in boxes:
                        cls_name = model.names[int(box.cls[0])]
                        # Filter by target classes if specified
                        if wanted is not None and cls_name not in wanted:
                            continue
                        # Each kept detection adds its confidence, so
                        # several objects in one frame accumulate.
                        score += float(box.conf[0]) * 10
                frame_scores[frame_num / fps] = score
            frame_num += 1
    finally:
        # Release the capture even if inference raises part-way through.
        cap.release()
    return frame_scores
# Example: prioritize person, bicycle detection
scores = score_video_frames(
    "raw_footage.mp4",
    model_size="yolov8m.pt",
    target_classes=["person", "bicycle"],
    sample_rate=3,
    confidence_threshold=0.35
)
print(f"Scored {len(scores)} timestamps")
if scores:
    # Look the peak up once; max() on an empty dict would raise ValueError.
    peak_time = max(scores, key=scores.get)
    print(f"Peak action at: {peak_time:.1f}s (score: {scores[peak_time]:.1f})")
else:
    print("No frames were scored; check the video path")
Step 2: Whisper Transcription with Word Timestamps
import whisper
import json
def transcribe_with_timestamps(
    video_path: str,
    model_size: str = "small"  # tiny, base, small, medium, large
) -> dict:
    """Transcribe a media file with Whisper and flatten its word timings.

    Args:
        video_path: Path handed to the Whisper model's ``transcribe``.
        model_size: Model name passed to ``whisper.load_model``.

    Returns:
        Dict with the full ``text``, the detected ``language``, a flat
        ``words`` list (each entry has ``word``, ``start``, ``end`` and
        ``probability``) and the raw ``segments`` from the result.
    """
    speech_model = whisper.load_model(model_size)
    # word_timestamps=True is what makes each segment carry a "words" list.
    result = speech_model.transcribe(video_path, word_timestamps=True)

    flat_words = [
        {
            "word": entry["word"].strip(),
            "start": entry["start"],
            "end": entry["end"],
            # Default to full confidence when no probability is present.
            "probability": entry.get("probability", 1.0),
        }
        for segment in result["segments"]
        if "words" in segment
        for entry in segment["words"]
    ]

    return {
        "text": result["text"],
        "language": result["language"],
        "words": flat_words,
        "segments": result["segments"],
    }
# Transcribe the sample footage with the default model size, then print a
# short summary: detected language, number of words, and the first five words.
transcript = transcribe_with_timestamps("raw_footage.mp4")
print(f"Language detected: {transcript['language']}")
print(f"Total words: {len(transcript['words'])}")
print(f"First 5 words: {[w['word'] for w in transcript['words'][0:5]]}")
Step 3: Find Highlight Windows
from collections import defaultdict
import pandas as pd
def find_highlight_windows(
    frame_scores: dict,
    transcript_words: list,
    min_duration: float = 3.0,  # Minimum clip length (seconds)
    max_duration: float = 15.0,  # Maximum clip length
    top_n: int = 10,  # Number of highlights to extract
    score_window: float = 2.0,  # Seconds to average scores over
    silence_penalty: float = 0.3  # Reduce score during silence
) -> list:
    """Pick the best non-overlapping highlight windows from scores and speech.

    Frame scores are summed into one-second bins. Each second's combined
    score is the sum of its own bin and both neighbours, plus twice the
    summed word probability of words starting in that second. Seconds are
    then visited from highest to lowest combined score; each one proposes a
    window starting one second earlier and lasting at most ``max_duration``.
    A window that overlaps an accepted highlight is skipped. If the window
    contains words, it is tightened to the first/last word with 0.3 s of
    padding, never growing beyond the proposed window.

    Args:
        frame_scores: Mapping of timestamp (seconds) to detection score.
        transcript_words: Dicts with ``word``, ``start``, ``end`` and
            optionally ``probability``; assumed to be in chronological order.
        min_duration: Shortest clip, in seconds, that is accepted.
        max_duration: Longest clip, in seconds, that is produced.
        top_n: Maximum number of highlights returned.
        score_window: Accepted for interface compatibility; currently unused
            (the smoothing window is fixed at three one-second bins).
        silence_penalty: Accepted for interface compatibility; currently
            unused.

    Returns:
        List of dicts with ``start``, ``end``, ``duration``, ``score`` and
        ``words``, sorted by ``start``. Empty when ``frame_scores`` is empty.
    """
    if not frame_scores:
        return []

    # Last sampled timestamp; used un-truncated so the tail of the video
    # is not cut off.
    video_end = max(frame_scores)

    timeline = defaultdict(float)
    for t, score in frame_scores.items():
        timeline[int(t)] += score

    speech_bins = defaultdict(float)
    for word in transcript_words:
        speech_bins[int(word["start"])] += word.get("probability", 1.0)

    combined_scores = {}
    # +1 so the final one-second bin is also a candidate.
    for t in range(int(video_end) + 1):
        window_score = sum(timeline.get(t + offset, 0) for offset in range(-1, 2))
        combined_scores[t] = window_score + speech_bins.get(t, 0) * 2.0

    highlights = []
    used_ranges = []
    for t in sorted(combined_scores, key=combined_scores.get, reverse=True):
        if len(highlights) >= top_n:
            break

        start = max(0, t - 1)
        # Measure the window from `start`, not from `t`; otherwise it is one
        # second longer than max_duration and can never be accepted as-is.
        end = min(video_end, start + max_duration)

        if any(start < used_end and end > used_start
               for used_start, used_end in used_ranges):
            continue

        window_words = [w for w in transcript_words
                        if w["start"] >= start and w["end"] <= end]
        if window_words:
            # Tighten to the speech, but stay inside the window that was
            # just checked for overlap.
            start = max(start, window_words[0]["start"] - 0.3)
            end = min(end, window_words[-1]["end"] + 0.3)

        duration = end - start
        # The upper bound (max_duration) holds by construction.
        if duration < min_duration:
            continue

        highlights.append({
            "start": round(start, 2),
            "end": round(end, 2),
            "duration": round(duration, 2),
            "score": combined_scores[t],
            "words": [w["word"] for w in window_words]
        })
        used_ranges.append((start, end))

    highlights.sort(key=lambda clip: clip["start"])
    return highlights
# Select up to eight highlights of 4-12 seconds from the frame scores and the
# transcribed words, then print each window's time range, score, and words.
highlights = find_highlight_windows(
    scores,
    transcript["words"],
    min_duration=4.0,
    max_duration=12.0,
    top_n=8
)
for h in highlights:
    # Only the first six words are shown to keep each line short.
    print(f" [{h['start']:.1f}s → {h['end']:.1f}s] score={h['score']:.1f} | {' '.join(h['words'][0:6])}...")
Step 4: Export Clips with Burned-In Captions
import ffmpeg
import os
def to_srt_time(seconds: float) -> str:
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    Negative offsets are clamped to zero.
    """
    total_ms = max(0, int(round(seconds * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02d}:{minutes:02d}:{secs:02d},{millis:03d}"


def export_highlight_clips(
    source_video: str,
    highlights: list,
    transcript_words: list,
    output_dir: str = "./highlights",
    burn_captions: bool = True,
    caption_style: str = "fontsize=36:fontcolor=white:borderw=2:bordercolor=black"
) -> list:
    """Cut each highlight out of the source video, optionally with captions.

    For every highlight a clip is re-encoded (libx264 / AAC, CRF 18) into
    ``output_dir``. When ``burn_captions`` is true and the highlight contains
    transcript words, an SRT file with four-word cues (timed relative to the
    clip start) is written next to the clip and rendered through ffmpeg's
    ``subtitles`` filter.

    Args:
        source_video: Path of the video to cut from.
        highlights: Dicts with at least ``start`` and ``end`` in seconds.
        transcript_words: Dicts with ``word``, ``start`` and ``end``.
        output_dir: Directory for clips and SRT files; created if missing.
        burn_captions: Whether to render captions into the video.
        caption_style: String passed as the filter's ``force_style`` value.
            NOTE(review): force_style is normally given as comma-separated
            ASS style fields (e.g. ``FontSize=36``) — confirm that this
            default has the intended effect.

    Returns:
        List of exported clip paths, in the order of ``highlights``.
    """
    os.makedirs(output_dir, exist_ok=True)
    exported = []
    chunk_size = 4  # words per caption cue

    for i, clip in enumerate(highlights):
        output_path = f"{output_dir}/highlight_{i+1:02d}_{clip['start']:.0f}s.mp4"
        output_args = {"acodec": "aac", "vcodec": "libx264", "crf": 18}

        clip_words = []
        if burn_captions:
            clip_words = [w for w in transcript_words
                          if w["start"] >= clip["start"] and w["end"] <= clip["end"] + 0.5]

        # With no words there is nothing to caption, so skip the subtitles
        # filter rather than handing ffmpeg an empty SRT file.
        if clip_words:
            srt_path = f"{output_dir}/clip_{i+1}.srt"
            # Explicit UTF-8 so non-ASCII transcripts do not depend on the
            # platform's default encoding.
            with open(srt_path, "w", encoding="utf-8") as f:
                for j in range(0, len(clip_words), chunk_size):
                    chunk = clip_words[j:j + chunk_size]
                    # Cue times are relative to the start of the cut clip.
                    start_ts = chunk[0]["start"] - clip["start"]
                    end_ts = chunk[-1]["end"] - clip["start"]
                    text = " ".join(w["word"] for w in chunk)
                    f.write(f"{j//chunk_size + 1}\n")
                    f.write(f"{to_srt_time(max(0, start_ts))} --> {to_srt_time(end_ts)}\n")
                    f.write(f"{text}\n\n")
            output_args["vf"] = f"subtitles={srt_path}:force_style='{caption_style}'"
            output_args["preset"] = "fast"

        (
            ffmpeg
            .input(source_video, ss=clip["start"], to=clip["end"])
            .output(output_path, **output_args)
            .overwrite_output()
            .run(quiet=True)
        )
        print(f"✓ Exported: {output_path}")
        exported.append(output_path)
    return exported
# Export every selected highlight from the sample footage into ./tiktok_clips
# with captions burned in, then report how many clips were written.
clips = export_highlight_clips(
    source_video="raw_footage.mp4",
    highlights=highlights,
    transcript_words=transcript["words"],
    output_dir="./tiktok_clips",
    burn_captions=True
)
print(f"\nExported {len(clips)} clips from {len(highlights)} highlights")
Full Pipeline in One Call
def run_auto_edit_pipeline(
    video_path: str,
    output_dir: str = "./auto_edited",
    yolo_model: str = "yolov8m.pt",
    whisper_model: str = "small",
    target_classes: list = None,
    n_highlights: int = 8
):
    """Run scoring, transcription, highlight selection and export in order.

    Args:
        video_path: Video to process.
        output_dir: Directory the exported clips are written to.
        yolo_model: Weights name forwarded to ``score_video_frames``.
        whisper_model: Model name forwarded to ``transcribe_with_timestamps``.
        target_classes: Class filter forwarded to ``score_video_frames``.
        n_highlights: Maximum number of highlights to select.

    Returns:
        Dict with ``highlights``, ``clips`` and ``transcript``.
    """
    print("Step 1/4: Scoring frames with YOLO...")
    frame_scores = score_video_frames(
        video_path,
        model_size=yolo_model,
        target_classes=target_classes,
    )

    print("Step 2/4: Transcribing with Whisper...")
    transcription = transcribe_with_timestamps(video_path, model_size=whisper_model)
    spoken_words = transcription["words"]

    print("Step 3/4: Finding highlights...")
    highlights = find_highlight_windows(frame_scores, spoken_words, top_n=n_highlights)
    print(f" Found {len(highlights)} highlight windows")

    print("Step 4/4: Exporting clips...")
    exported_paths = export_highlight_clips(
        video_path,
        highlights,
        spoken_words,
        output_dir,
    )

    return {
        "highlights": highlights,
        "clips": exported_paths,
        "transcript": transcription,
    }
# Run it: process the 30-minute source file end to end, counting only
# "person" and "bicycle" detections and keeping at most six highlights.
# The returned dict holds the highlights, the clip paths, and the transcript.
result = run_auto_edit_pipeline(
    video_path="30min_raw_footage.mp4",
    output_dir="./tiktok_clips",
    target_classes=["person", "bicycle"],
    n_highlights=6
)
The Video Workspace productizes this entire pipeline with 97 lines of config and 30+ editing methods. 30 minutes of footage → 3 platform-ready clips in 3 minutes.



