Merge proper speech-to-text feature gating from PR #3006

StarbirdTech · claude · StarbirdTech · commit 3be4e893c0a6 · 2026-01-29T01:09:24.000-05:00
Resolves merge conflicts by taking the new feature-gated implementation, which:
- Uses #[cfg(feature = "speech-to-text")] instead of #[cfg(all(feature = "ffmpeg", feature = "whisper"))]
- Properly integrates with the ffmpeg feature hierarchy (ai -> speech-to-text -> ffmpeg + whisper)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -24,7 +24,7 @@ concurrency:
 jobs:
   rustfmt:
     name: Rust Formatting
-    runs-on: ubuntu-22.04
+    runs-on: blacksmith-4vcpu-ubuntu-2204
     timeout-minutes: 10
     permissions:
       contents: read
@@ -94,7 +94,7 @@ jobs:
       matrix:
         settings:
           - host: ubuntu-22.04
-            target: x86_64-unknown-linux-gnu
+            target: blacksmith-4vcpu-ubuntu-2404
     name: Clippy (${{ matrix.settings.host }})
     runs-on: ${{ matrix.settings.host }}
     permissions:
@@ -164,7 +164,7 @@ jobs:
 
   typescript:
     name: TypeScript
-    runs-on: ubuntu-22.04
+    runs-on: blacksmith-4vcpu-ubuntu-2204
     timeout-minutes: 15
     permissions:
       contents: read
diff --git a/.github/workflows/core_tests.yml b/.github/workflows/core_tests.yml
@@ -22,7 +22,7 @@ jobs:
             target: aarch64-apple-darwin
             os: macos
           - host: ubuntu-22.04
-            target: x86_64-unknown-linux-gnu
+            target: blacksmith-4vcpu-ubuntu-2404
             os: linux
           - host: [self-hosted, Windows, X64]
             target: x86_64-pc-windows-msvc
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -65,7 +65,7 @@ jobs:
 
   #     - name: Build CLI binaries
   #       run: |
-  #         cargo build --release --bin sd-cli --bin sd-daemon --features heif,ffmpeg --target ${{ matrix.target }}
+  #         cargo build --release --bin sd-cli --bin sd-daemon --features heif,ffmpeg,ai --target ${{ matrix.target }}
   #       env:
   #         # Set linker for cross-compilation
   #         CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER: aarch64-linux-gnu-gcc
@@ -111,7 +111,7 @@ jobs:
             target: x86_64-unknown-linux-gnu
             platform: linux-x86_64
           - host: ubuntu-22.04
-            target: aarch64-unknown-linux-gnu
+            target: blacksmith-4vcpu-ubuntu-2404-arm
             platform: linux-aarch64
     name: Server - ${{ matrix.settings.platform }}
     runs-on: ${{ matrix.settings.host }}
@@ -144,7 +144,7 @@ jobs:
 
       - name: Build server binary
         run: |
-          cargo build --release --bin sd-server --features sd-core/heif,sd-core/ffmpeg --target ${{ matrix.settings.target }}
+          cargo build --release --bin sd-server --features sd-core/heif,sd-core/ffmpeg,sd-core/ai --target ${{ matrix.settings.target }}
         env:
           CARGO_TARGET_AARCH64_UNKNOWN_LINUX_GNU_LINKER: aarch64-linux-gnu-gcc
 
@@ -194,7 +194,7 @@ jobs:
           #   arch: x86_64
           # Linux builds
           - host: ubuntu-22.04
-            target: x86_64-unknown-linux-gnu
+            target: blacksmith-4vcpu-ubuntu-2404
             bundles: deb
             os: linux
             arch: x86_64
@@ -309,7 +309,7 @@ jobs:
   # Create unified release with Server, CLI, and Desktop artifacts
   release:
     if: startsWith(github.ref, 'refs/tags/')
-    runs-on: self-hosted
+    runs-on: blacksmith-4vcpu-ubuntu-2404
     name: Create Release
     needs: [server-build, desktop-main]
     permissions:
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
@@ -13,7 +13,7 @@ on:
 jobs:
   build-server:
     name: Build a docker image for spacedrive server
-    runs-on: ubuntu-latest
+    runs-on: blacksmith-4vcpu-ubuntu-2404
     defaults:
       run:
         shell: bash
diff --git a/apps/cli/Cargo.toml b/apps/cli/Cargo.toml
@@ -7,6 +7,9 @@ version = "2.0.0-pre.1"
 default = []
 heif = ["sd-core/heif"]
 ffmpeg = ["sd-core/ffmpeg"]
+whisper = ["sd-core/whisper"]
+speech-to-text = ["sd-core/speech-to-text"]
+ai = ["sd-core/ai"]
 
 [dependencies]
 anyhow      = "1"
diff --git a/apps/server/Cargo.toml b/apps/server/Cargo.toml
@@ -7,6 +7,9 @@ edition = "2021"
 default = []
 heif = ["sd-core/heif"]
 ffmpeg = ["sd-core/ffmpeg"]
+whisper = ["sd-core/whisper"]
+speech-to-text = ["sd-core/speech-to-text"]
+ai = ["sd-core/ai"]
 
 [dependencies]
 # Spacedrive core
diff --git a/core/Cargo.toml b/core/Cargo.toml
@@ -6,10 +6,14 @@ autobins = true
 
 [features]
 default = ["wasm"]
-# FFmpeg support for video thumbnails and audio transcription
+# FFmpeg support for video thumbnails and audio extraction
 ffmpeg = ["dep:sd-ffmpeg"]
-# AI models support
-ai = []
+# Whisper speech recognition engine (internal dependency)
+whisper = ["dep:whisper-rs", "dep:hound", "dep:rubato"]
+# Speech-to-text transcription (requires audio extraction + recognition)
+speech-to-text = ["ffmpeg", "whisper"]
+# AI features umbrella (heavy deps, can be disabled for lite builds or mobile)
+ai = ["speech-to-text"]
 # HEIF image support (extends sd-images with HEIF format)
 heif = ["sd-images/heif"]
 # Mobile platform support (excludes wasm which doesn't work on iOS)
@@ -18,8 +22,6 @@ mobile = []
 cli = []
 # WASM plugin system (disabled on mobile)
 wasm = ["dep:wasmer", "dep:wasmer-middlewares"]
-# Whisper speech-to-text support (disabled on Android due to BLAS cross-compilation issues)
-whisper = ["dep:whisper-rs", "dep:hound", "dep:rubato"]
 
 
 [dependencies]
@@ -126,10 +128,10 @@ sd-media-metadata = { path = "../crates/media-metadata" }
 tokio-rustls     = "0.26"
 webp             = "0.3"
 
-# Speech-to-text dependencies (optional - disabled on Android due to cross-compilation issues)
+# Speech-to-text dependencies (optional, behind whisper feature)
 whisper-rs = { version = "0.15.1", optional = true }
-hound      = { version = "3.5", optional = true }  # WAV file reading
-rubato     = { version = "0.16", optional = true } # Audio resampling to 16kHz
+hound      = { version = "3.5", optional = true }   # WAV file reading
+rubato     = { version = "0.16", optional = true }  # Audio resampling to 16kHz
 
 # Networking
 # Iroh P2P networking
diff --git a/core/src/domain/location.rs b/core/src/domain/location.rs
@@ -539,7 +539,7 @@ impl Default for SpeechPolicy {
 
 impl SpeechPolicy {
 	/// Convert this policy to a SpeechToTextJobConfig for job dispatch
-	#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+	#[cfg(feature = "speech-to-text")]
 	pub fn to_job_config(
 		&self,
 		location_id: Option<Uuid>,
diff --git a/core/src/ops/indexing/change_detection/persistent.rs b/core/src/ops/indexing/change_detection/persistent.rs
@@ -405,11 +405,13 @@ impl ChangeHandler for DatabaseAdapter {
 		use crate::ops::indexing::processor::{
 			load_location_processor_config, ContentHashProcessor, ProcessorEntry,
 		};
-		#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
-		use crate::ops::media::speech::SpeechToTextProcessor;
 		use crate::ops::media::{ocr::OcrProcessor, proxy::ProxyProcessor};
 		#[cfg(feature = "ffmpeg")]
-		use crate::ops::media::{thumbnail::ThumbnailProcessor, thumbstrip::ThumbstripProcessor};
+		use crate::ops::media::{
+			thumbnail::ThumbnailProcessor, thumbstrip::ThumbstripProcessor,
+		};
+		#[cfg(feature = "speech-to-text")]
+		use crate::ops::media::speech::SpeechToTextProcessor;
 
 		if entry.is_directory() {
 			return Ok(());
@@ -583,7 +585,7 @@ impl ChangeHandler for DatabaseAdapter {
 		}
 
 		// Speech-to-text
-		#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+		#[cfg(feature = "speech-to-text")]
 		if proc_config
 			.watcher_processors
 			.iter()
diff --git a/core/src/ops/locations/trigger_job/action.rs b/core/src/ops/locations/trigger_job/action.rs
@@ -168,7 +168,7 @@ impl LibraryAction for LocationTriggerJobAction {
 				})?
 			}
 
-			#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+			#[cfg(feature = "speech-to-text")]
 			JobType::SpeechToText => {
 				if !job_policies.speech_to_text.enabled && !self.input.force {
 					return Err(ActionError::Validation {
@@ -198,13 +198,11 @@ impl LibraryAction for LocationTriggerJobAction {
 				});
 			}
 
-			#[cfg(not(all(feature = "ffmpeg", feature = "whisper")))]
+			#[cfg(not(feature = "speech-to-text"))]
 			JobType::SpeechToText => {
 				return Err(ActionError::Validation {
 					field: "job_type".to_string(),
-					message:
-						"Speech-to-text requires FFmpeg and Whisper support which is not enabled"
-							.to_string(),
+					message: "Speech-to-text requires FFmpeg and Whisper support which is not enabled".to_string(),
 				});
 			}
 
diff --git a/core/src/ops/media/mod.rs b/core/src/ops/media/mod.rs
@@ -30,7 +30,7 @@ pub use ocr::{OcrJob, OcrProcessor};
 pub use proxy::{ProxyJob, ProxyProcessor};
 pub use splat::{GaussianSplatJob, GaussianSplatProcessor};
 
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 pub use speech::{SpeechToTextJob, SpeechToTextProcessor};
 #[cfg(feature = "ffmpeg")]
 pub use thumbnail::ThumbnailJob;
diff --git a/core/src/ops/media/speech/action.rs b/core/src/ops/media/speech/action.rs
@@ -54,7 +54,7 @@ impl LibraryAction for TranscribeAudioAction {
 		library: Arc<crate::library::Library>,
 		_context: Arc<CoreContext>,
 	) -> Result<Self::Output, ActionError> {
-		#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+		#[cfg(feature = "speech-to-text")]
 		{
 			tracing::info!(
 				"Dispatching speech-to-text job for entry: {}",
@@ -89,7 +89,7 @@ impl LibraryAction for TranscribeAudioAction {
 			})
 		}
 
-		#[cfg(not(all(feature = "ffmpeg", feature = "whisper")))]
+		#[cfg(not(feature = "speech-to-text"))]
 		{
 			Err(ActionError::InvalidInput(
 				"Speech-to-text feature is not enabled. Please rebuild with --features ffmpeg,whisper".to_string()
diff --git a/core/src/ops/media/speech/mod.rs b/core/src/ops/media/speech/mod.rs
@@ -3,28 +3,28 @@
 //! Transcribes audio/video to text using whisper.rs.
 //! Generates .srt subtitle files as sidecars.
 //!
-//! Requires both `ffmpeg` and `whisper` features to be enabled.
+//! Requires the `speech-to-text` feature (enables `ffmpeg` + `whisper`).
 
 pub mod action;
 
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 pub mod job;
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 pub mod processor;
 
 pub use action::{TranscribeAudioAction, TranscribeAudioInput, TranscribeAudioOutput};
 
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 pub use job::{SpeechToTextJob, SpeechToTextJobConfig};
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 pub use processor::SpeechToTextProcessor;
 
 use anyhow::{Context, Result};
 use std::path::Path;
 use std::sync::Arc;
 
 /// Transcribe audio/video to text using whisper
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 pub async fn transcribe_audio_file(
 	source_path: &Path,
 	model: &str,
@@ -99,15 +99,15 @@ pub async fn transcribe_audio_file(
 
 /// Load audio file and convert to 16kHz mono f32 samples required by Whisper
 /// Uses FFmpeg libraries directly (no subprocess)
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 fn load_audio_samples(path: &Path) -> Result<Vec<f32>> {
 	// Use sd-ffmpeg to extract audio samples directly
 	// This returns 16kHz mono f32 PCM samples, exactly what Whisper needs
 	Ok(sd_ffmpeg::extract_audio_samples(path)?)
 }
 
 /// Format a single SRT subtitle segment
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 fn format_srt_segment(index: usize, start: f64, end: f64, text: &str) -> String {
 	let start_time = format_srt_timestamp(start);
 	let end_time = format_srt_timestamp(end);
@@ -122,7 +122,7 @@ fn format_srt_segment(index: usize, start: f64, end: f64, text: &str) -> String
 }
 
 /// Format timestamp in SRT format (HH:MM:SS,mmm)
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 fn format_srt_timestamp(seconds: f64) -> String {
 	let hours = (seconds / 3600.0).floor() as u32;
 	let minutes = ((seconds % 3600.0) / 60.0).floor() as u32;
@@ -146,7 +146,7 @@ pub fn is_speech_supported(mime_type: &str, registry: &crate::filetype::FileType
 }
 
 /// Get audio duration in seconds using ffprobe (public for job progress estimation)
-#[cfg(all(feature = "ffmpeg", feature = "whisper"))]
+#[cfg(feature = "speech-to-text")]
 pub async fn get_audio_duration_public(path: &Path) -> Result<f32> {
 	use std::process::Command;