// Package transcode drives ffmpeg to find the speech portions of a media file
// and to export a new file containing only those portions.
package transcode

import (
	"bufio"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"os/exec"
	"regexp"
	"strconv"
	"strings"
)

// Segment is a time interval [Start, End) in seconds.
type Segment struct {
	Start float64 `json:"start"`
	End   float64 `json:"end"`
}

// wsMsg is the JSON payload handed to the broadcast callback. Type selects the
// message kind ("segments" or "progress" in this file); the remaining fields
// are omitted from the JSON when they hold their zero value.
type wsMsg struct {
	Type     string    `json:"type"`
	Segments []Segment `json:"segments,omitempty"`
	Duration float64   `json:"duration,omitempty"`
	Percent  float64   `json:"percent,omitempty"`
	Message  string    `json:"message,omitempty"`
}

// Patterns used to parse ffmpeg's textual output. They are compiled once at
// package scope instead of on every call.
var (
	ffmpegDurationRe = regexp.MustCompile(`Duration:\s+(\d+):(\d+):([0-9.]+)`)
	silenceStartRe   = regexp.MustCompile(`silence_start:\s*([0-9.e+\-]+)`)
	silenceEndRe     = regexp.MustCompile(`silence_end:\s*([0-9.e+\-]+)`)
	progressTimeRe   = regexp.MustCompile(`out_time_ms=(\d+)`)
)

// send JSON-encodes msg and passes the bytes to broadcast.
//
// Nothing is broadcast when broadcast is nil or when msg cannot be encoded
// (encoding/json rejects NaN and ±Inf floats); sending an empty payload in
// those cases would only confuse the receiver.
func send(msg *wsMsg, broadcast func([]byte)) {
	if broadcast == nil {
		return
	}
	data, err := json.Marshal(msg)
	if err != nil {
		return
	}
	broadcast(data)
}

// ──────────────────────────────────────────────────────────────────────────────
// Silence detection
// ──────────────────────────────────────────────────────────────────────────────

// DetectSpeechSegments runs ffmpeg's silencedetect filter and returns the
// non-silent (speech) segments. A "segments" WebSocket message is broadcast
// when detection finishes.
//
// noiseDb and minDuration are passed to silencedetect as its noise threshold
// (in dB) and minimum silence duration (in seconds). padding is the number of
// seconds of audio kept on each side of a detected silence.
//
// An error is returned when ffmpeg cannot be started, exits with a failure,
// its output cannot be read, or the input duration cannot be determined.
func DetectSpeechSegments(inputPath string, noiseDb, minDuration, padding float64, broadcast func([]byte)) ([]Segment, error) {
	filter := fmt.Sprintf("silencedetect=noise=%.0fdB:d=%.2f", noiseDb, minDuration)
	cmd := exec.Command("ffmpeg",
		"-i", inputPath,
		"-af", filter,
		"-f", "null",
		"-",
	)

	// ffmpeg writes silencedetect output to stderr.
	stderr, err := cmd.StderrPipe()
	if err != nil {
		return nil, err
	}
	if err := cmd.Start(); err != nil {
		return nil, fmt.Errorf("ffmpeg start: %w", err)
	}

	silences, totalDuration, parseErr := parseSilenceLog(stderr)

	// Keep reading until EOF even if parsing stopped early: ffmpeg blocks once
	// the pipe fills up, and Wait must not be called before all reads finish.
	_, _ = io.Copy(io.Discard, stderr)

	if err := cmd.Wait(); err != nil {
		return nil, fmt.Errorf("ffmpeg: %w", err)
	}
	if parseErr != nil {
		return nil, fmt.Errorf("reading ffmpeg output: %w", parseErr)
	}
	if totalDuration <= 0 {
		// Without a duration every segment would be clamped to zero length.
		return nil, errors.New("ffmpeg: could not determine input duration")
	}

	segments := removeSilence(silences, totalDuration, padding)
	send(&wsMsg{
		Type:     "segments",
		Segments: segments,
		Duration: totalDuration,
	}, broadcast)
	return segments, nil
}

// parseSilenceLog reads ffmpeg's log output from r and extracts the detected
// silence intervals together with the input duration in seconds (0 when no
// "Duration:" line was seen). A silence that is still open when the log ends
// is closed at the input duration.
func parseSilenceLog(r io.Reader) ([]silenceInterval, float64, error) {
	var (
		silences      []silenceInterval
		pendingStart  float64
		inSilence     bool
		totalDuration float64
	)

	scanner := bufio.NewScanner(r)
	scanner.Split(scanFFmpegLines)
	for scanner.Scan() {
		line := scanner.Text()

		if m := ffmpegDurationRe.FindStringSubmatch(line); m != nil {
			hh, errH := strconv.ParseFloat(m[1], 64)
			mm, errM := strconv.ParseFloat(m[2], 64)
			ss, errS := strconv.ParseFloat(m[3], 64)
			if errH == nil && errM == nil && errS == nil {
				totalDuration = hh*3600 + mm*60 + ss
			}
		}
		if m := silenceStartRe.FindStringSubmatch(line); m != nil {
			// Skip values that do not parse rather than treating them as 0.
			if v, err := strconv.ParseFloat(m[1], 64); err == nil {
				pendingStart, inSilence = v, true
			}
		}
		if m := silenceEndRe.FindStringSubmatch(line); m != nil {
			if v, err := strconv.ParseFloat(m[1], 64); err == nil {
				silences = append(silences, silenceInterval{start: pendingStart, end: v})
				inSilence = false
			}
		}
	}
	if err := scanner.Err(); err != nil {
		return nil, 0, err
	}

	// A silence_start without a matching silence_end means the silence ran
	// to the end of the input.
	if inSilence && totalDuration > pendingStart {
		silences = append(silences, silenceInterval{start: pendingStart, end: totalDuration})
	}
	return silences, totalDuration, nil
}

// scanFFmpegLines is a bufio.SplitFunc that ends a token at either '\n' or
// '\r'. ffmpeg rewrites its status line using carriage returns, so splitting
// on newlines alone can produce a single "line" larger than the Scanner's
// token limit.
func scanFFmpegLines(data []byte, atEOF bool) (advance int, token []byte, err error) {
	for i, b := range data {
		if b == '\n' || b == '\r' {
			return i + 1, data[:i], nil
		}
	}
	if atEOF && len(data) > 0 {
		// Final, unterminated line.
		return len(data), data, nil
	}
	// Ask the Scanner for more data.
	return 0, nil, nil
}

// silenceInterval is one silence region reported by ffmpeg, in seconds.
type silenceInterval struct{ start, end float64 }

// removeSilence turns silence regions into speech regions, with a small
// padding buffer so words at the edges don't get clipped.
//
// Each silence is shrunk by padding on both sides before it is cut out. A
// silence that disappears entirely after shrinking is ignored, so the speech
// around it stays in a single segment. Speech slivers of 0.05 s or less
// between two cuts are dropped, as is a tail shorter than 0.01 s. With no
// silences the whole input [0, totalDuration) is returned as one segment.
func removeSilence(silences []silenceInterval, totalDuration, padding float64) []Segment {
	if len(silences) == 0 {
		return []Segment{{Start: 0, End: totalDuration}}
	}

	var segments []Segment
	cursor := 0.0 // start of the speech segment currently being built
	for _, s := range silences {
		cutStart := s.start + padding
		cutEnd := s.end - padding
		if cutEnd <= cutStart {
			// The padding swallows this silence completely; cutting here
			// would only split one speech run into two adjacent segments.
			continue
		}
		if cutStart > totalDuration {
			cutStart = totalDuration
		}
		if cutStart-cursor > 0.05 {
			segments = append(segments, Segment{Start: cursor, End: cutStart})
		}
		// Never move backwards, even if intervals overlap or are unordered.
		if cutEnd > cursor {
			cursor = cutEnd
		}
	}
	if cursor < totalDuration-0.01 {
		segments = append(segments, Segment{Start: cursor, End: totalDuration})
	}
	return segments
}

// ExportSegments re-encodes inputPath into outputPath keeping only the given
// segments, concatenated in the order given. While ffmpeg runs, "progress"
// messages carrying a percentage in [0, 100] are sent through broadcast.
//
// Segments whose End is not greater than their Start are skipped. An error is
// returned when no usable segment remains, when ffmpeg cannot be started, or
// when it exits with a failure. ffmpeg's own stderr is forwarded to os.Stderr.
func ExportSegments(
	inputPath, outputPath string,
	segments []Segment,
	broadcast func([]byte),
) error {
	filterComplex, totalDuration, err := buildConcatFilter(segments)
	if err != nil {
		return err
	}

	cmd := exec.Command("ffmpeg",
		"-i", inputPath,
		"-filter_complex", filterComplex,
		"-map", "[outv]",
		"-map", "[outa]",
		// around 80% compression
		"-c:v", "libx264",
		"-crf", "28",
		"-preset", "fast",
		"-c:a", "aac",
		"-b:a", "128k",
		"-progress", "pipe:1",
		"-y",
		outputPath,
	)

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		return err
	}
	cmd.Stderr = os.Stderr

	if err := cmd.Start(); err != nil {
		return fmt.Errorf("ffmpeg start: %w", err)
	}

	scanner := bufio.NewScanner(stdout)
	for scanner.Scan() {
		m := progressTimeRe.FindStringSubmatch(scanner.Text())
		if m == nil {
			continue
		}
		elapsed, err := strconv.ParseFloat(m[1], 64)
		if err != nil {
			continue
		}
		// The value is divided by 1e6 to obtain seconds, i.e. it is treated
		// as microseconds despite the "_ms" suffix of the key.
		pct := (elapsed / 1e6) / totalDuration * 100
		if pct > 100 {
			pct = 100
		}
		send(&wsMsg{Type: "progress", Percent: pct}, broadcast)
	}

	// Drain whatever is left so ffmpeg can never block on a full pipe, and so
	// that all reads are finished before Wait closes it.
	_, _ = io.Copy(io.Discard, stdout)

	if err := cmd.Wait(); err != nil {
		return fmt.Errorf("ffmpeg: %w", err)
	}
	return nil
}

// buildConcatFilter builds the -filter_complex graph that trims every usable
// segment out of input 0 and concatenates the pieces, and returns it together
// with the summed length of those segments in seconds.
//
// Trimming and resetting timestamps before concatenating avoids the
// non-monotonic DTS issues that occur when stream-copying segments with their
// original timestamps.
func buildConcatFilter(segments []Segment) (string, float64, error) {
	var (
		parts  = make([]string, 0, 2*len(segments)+1)
		inputs strings.Builder
		total  float64
		n      int
	)
	for _, seg := range segments {
		// Written as a negated ">" so that NaN bounds are rejected too.
		if !(seg.End > seg.Start) {
			continue
		}
		parts = append(parts,
			fmt.Sprintf("[0:v]trim=start=%.6f:end=%.6f,setpts=PTS-STARTPTS[v%d]",
				seg.Start, seg.End, n),
			fmt.Sprintf("[0:a]atrim=start=%.6f:end=%.6f,asetpts=PTS-STARTPTS[a%d]",
				seg.Start, seg.End, n),
		)
		fmt.Fprintf(&inputs, "[v%d][a%d]", n, n)
		total += seg.End - seg.Start
		n++
	}
	if n == 0 {
		return "", 0, errors.New("no non-empty segments to export")
	}
	parts = append(parts,
		fmt.Sprintf("%sconcat=n=%d:v=1:a=1[outv][outa]", inputs.String(), n),
	)
	return strings.Join(parts, ";"), total, nil
}