package transcode import ( "bufio" "encoding/json" "fmt" "os" "os/exec" "regexp" "strconv" ) // Segment is a time interval [Start, End) in seconds. type Segment struct { Start float64 `json:"start"` End float64 `json:"end"` } type wsMsg struct { Type string `json:"type"` Segments []Segment `json:"segments,omitempty"` Duration float64 `json:"duration,omitempty"` Percent float64 `json:"percent,omitempty"` Message string `json:"message,omitempty"` } func send(msg wsMsg, broadcast func([]byte)) { data, _ := json.Marshal(msg) broadcast(data) } // ────────────────────────────────────────────────────────────────────────────── // Silence detection // ────────────────────────────────────────────────────────────────────────────── // DetectSpeechSegments runs ffmpeg's silencedetect filter and returns the // non-silent (speech) segments. A "segments" WebSocket message is broadcast // when detection finishes. func DetectSpeechSegments( inputPath string, noiseDb, minDuration, padding float64, broadcast func([]byte), ) ([]Segment, error) { filter := fmt.Sprintf("silencedetect=noise=%.0fdB:d=%.2f", noiseDb, minDuration) cmd := exec.Command("ffmpeg", "-i", inputPath, "-af", filter, "-f", "null", "-", ) // ffmpeg writes silencedetect output to stderr stderr, err := cmd.StderrPipe() if err != nil { return nil, err } if err := cmd.Start(); err != nil { return nil, fmt.Errorf("ffmpeg start: %w", err) } var ( silences []silenceInterval pendingStart float64 totalDuration float64 durationRe = regexp.MustCompile(`Duration:\s+(\d+):(\d+):([0-9.]+)`) startRe = regexp.MustCompile(`silence_start:\s*([0-9.e+\-]+)`) endRe = regexp.MustCompile(`silence_end:\s*([0-9.e+\-]+)`) ) scanner := bufio.NewScanner(stderr) for scanner.Scan() { line := scanner.Text() if m := durationRe.FindStringSubmatch(line); m != nil { h, _ := strconv.ParseFloat(m[1], 64) min, _ := strconv.ParseFloat(m[2], 64) s, _ := strconv.ParseFloat(m[3], 64) totalDuration = h*3600 + min*60 + s } if m := startRe.FindStringSubmatch(line); m != nil { pendingStart, _ = strconv.ParseFloat(m[1], 64) } if m := endRe.FindStringSubmatch(line); m != nil { end, _ := strconv.ParseFloat(m[1], 64) silences = append(silences, silenceInterval{start: pendingStart, end: end}) } } if err := cmd.Wait(); err != nil { return nil, fmt.Errorf("ffmpeg: %w", err) } segments := invertSilences(silences, totalDuration, padding) send(wsMsg{ Type: "segments", Segments: segments, Duration: totalDuration, }, broadcast) return segments, nil } type silenceInterval struct{ start, end float64 } // invertSilences turns silence regions into speech regions, with a small // padding buffer so words at the edges don't get clipped. func invertSilences(silences []silenceInterval, totalDuration, padding float64) []Segment { if len(silences) == 0 { return []Segment{{Start: 0, End: totalDuration}} } var segments []Segment cursor := 0.0 for _, s := range silences { segEnd := s.start + padding if segEnd > totalDuration { segEnd = totalDuration } if segEnd-cursor > 0.05 { segments = append(segments, Segment{Start: cursor, End: segEnd}) } next := s.end - padding if next < segEnd { next = segEnd } cursor = next } if cursor < totalDuration-0.01 { segments = append(segments, Segment{Start: cursor, End: totalDuration}) } return segments } // ────────────────────────────────────────────────────────────────────────────── // Export // ────────────────────────────────────────────────────────────────────────────── // ExportSegments concatenates the given segments using ffmpeg's concat demuxer // (stream-copy, no re-encode). Progress is streamed via broadcast. func ExportSegments( inputPath, outputPath string, segments []Segment, broadcast func([]byte), ) error { concatPath := outputPath + ".concat.txt" f, err := os.Create(concatPath) if err != nil { return fmt.Errorf("create concat file: %w", err) } defer os.Remove(concatPath) totalDuration := 0.0 for _, seg := range segments { totalDuration += seg.End - seg.Start fmt.Fprintf(f, "file '%s'\ninpoint %.6f\noutpoint %.6f\n", inputPath, seg.Start, seg.End) } f.Close() cmd := exec.Command("ffmpeg", "-f", "concat", "-safe", "0", "-i", concatPath, "-c", "copy", "-progress", "pipe:1", "-y", outputPath, ) stdout, err := cmd.StdoutPipe() if err != nil { return err } cmd.Stderr = os.Stderr if err := cmd.Start(); err != nil { return fmt.Errorf("ffmpeg start: %w", err) } timeRe := regexp.MustCompile(`out_time_ms=(\d+)`) scanner := bufio.NewScanner(stdout) for scanner.Scan() { if m := timeRe.FindStringSubmatch(scanner.Text()); m != nil { ms, err := strconv.ParseFloat(m[1], 64) if err != nil || ms < 0 { continue } pct := (ms / 1e6) / totalDuration * 100 if pct > 100 { pct = 100 } send(wsMsg{Type: "progress", Percent: pct}, broadcast) } } if err := cmd.Wait(); err != nil { return fmt.Errorf("ffmpeg: %w", err) } return nil }