@@ -25,6 +25,14 @@ use url::Url;
2525use crate :: storage:: error:: StorageError :: { InvalidPath , UrlParseError } ;
2626use crate :: storage:: Result ;
2727
28+ /// Normalizes path separators to forward slashes for cross-platform consistency.
29+ ///
30+ /// Windows paths use backslashes, but for consistency with URLs and storage operations,
31+ /// we normalize all paths to use forward slashes.
32+ fn normalize_path_separators ( path : & str ) -> String {
33+ path. replace ( '\\' , "/" )
34+ }
35+
2836/// Parses a URI string into a URL.
2937pub fn parse_uri ( uri : & str ) -> Result < Url > {
3038 // Check for Windows drive paths first to prevent them from being parsed as URLs
@@ -33,10 +41,10 @@ pub fn parse_uri(uri: &str) -> Result<Url> {
3341 Url :: from_directory_path ( uri)
3442 . map_err ( |_| UrlParseError ( url:: ParseError :: EmptyHost ) ) ?
3543 } else {
36- match Url :: parse ( uri ) {
37- Ok ( url ) => url ,
38- Err ( e ) => Url :: from_directory_path ( uri) . map_err ( |_| UrlParseError ( e) ) ? ,
39- }
44+ // Try parsing as URL first, fall back to directory path for local paths
45+ Url :: parse ( uri ) . or_else ( |e| {
46+ Url :: from_directory_path ( uri) . map_err ( |_| UrlParseError ( e) )
47+ } ) ?
4048 } ;
4149
4250 if url. path ( ) . ends_with ( '/' ) {
@@ -52,6 +60,30 @@ pub fn get_scheme_authority(url: &Url) -> String {
5260 format ! ( "{}://{}" , url. scheme( ) , url. authority( ) )
5361}
5462
63+ /// Converts a URL to a path string suitable for file system operations.
64+ ///
65+ /// For file:// URLs, this returns the decoded native file path to avoid
66+ /// percent-encoding issues (e.g., ~ being encoded as %7E on Windows).
67+ /// For other schemes (s3://, gs://, etc.), returns the URL path as-is.
68+ ///
69+ /// # Arguments
70+ /// * `url` - The URL to convert
71+ ///
72+ /// # Returns
73+ /// A path string suitable for file operations
74+ pub fn url_to_path_string ( url : & Url ) -> Result < String > {
75+ if url. scheme ( ) == "file" {
76+ // Use to_file_path() to properly decode file URLs and avoid percent-encoding issues
77+ // This is critical on Windows where ~ gets encoded as %7E in URL paths
78+ url. to_file_path ( )
79+ . map_err ( |_| InvalidPath ( format ! ( "Failed to convert file URL to path: {}" , url) ) )
80+ . map ( |p| p. to_string_lossy ( ) . to_string ( ) )
81+ } else {
82+ // For cloud storage URLs (s3://, gs://, etc.), use the URL path directly
83+ Ok ( url. path ( ) . to_string ( ) )
84+ }
85+ }
86+
5587/// Joins a base URL with a list of segments.
5688///
5789/// # Arguments
@@ -113,7 +145,7 @@ pub fn join_base_path_with_relative_path(base: &str, relative: &str) -> Result<S
113145/// * `segments` - Path segments to append
114146///
115147/// # Returns
116- /// The joined path as a string, or an error if the input is invalid
148+ /// The joined path as a string with forward slashes , or an error if the input is invalid
117149pub fn join_base_path_with_segments ( base : & str , segments : & [ & str ] ) -> Result < String > {
118150 // 1) Windows absolute path check FIRST (before URL parsing)
119151 // This prevents "C:/foo" from being parsed as a URL with scheme "C"
@@ -127,7 +159,8 @@ pub fn join_base_path_with_segments(base: &str, segments: &[&str]) -> Result<Str
127159 }
128160 }
129161 }
130- return Ok ( path. to_string_lossy ( ) . into_owned ( ) ) ;
162+ let path_str = path. to_string_lossy ( ) ;
163+ return Ok ( normalize_path_separators ( & path_str) ) ;
131164 }
132165
133166 // 2) Try URL mode: parse base as absolute URL
@@ -164,7 +197,8 @@ pub fn join_base_path_with_segments(base: &str, segments: &[&str]) -> Result<Str
164197 }
165198 }
166199 }
167- Ok ( path. to_string_lossy ( ) . into_owned ( ) )
200+ let path_str = path. to_string_lossy ( ) ;
201+ Ok ( normalize_path_separators ( & path_str) )
168202}
169203
170204/// Detects if a string starts like a Windows drive path: `C:\` or `C:/`
@@ -224,15 +258,41 @@ mod tests {
224258 assert ! ( url. path( ) . contains( "test" ) ) ;
225259 assert ! ( url. path( ) . contains( "data" ) ) ;
226260
227- // Windows path with drive letter only
228- let url = parse_uri ( "D:\ \ " ) . unwrap ( ) ;
261+ // Windows path with drive letter and trailing backslash
262+ let url = parse_uri ( r "D:\") . unwrap ( ) ;
229263 assert_eq ! ( url. scheme( ) , "file" ) ;
230264
231- // Windows path with drive letter and colon
232- let url = parse_uri ( "E: " ) . unwrap ( ) ;
265+ // Windows path with drive letter and trailing forward slash
266+ let url = parse_uri ( "D:/ " ) . unwrap ( ) ;
233267 assert_eq ! ( url. scheme( ) , "file" ) ;
234268 }
235269
270+ #[ test]
271+ fn test_url_to_path_string_with_file_url ( ) {
272+ // Test that url_to_path_string properly decodes file:// URLs
273+ // This is critical on Windows where ~ gets encoded as %7E in URL paths
274+ let path_with_tilde = "/tmp/RUNNER~1/test.parquet" ;
275+ let url = Url :: from_directory_path ( path_with_tilde) . unwrap ( ) ;
276+
277+ let path_str = url_to_path_string ( & url) . unwrap ( ) ;
278+
279+ // The path should contain the original tilde (not %7E)
280+ assert ! ( path_str. contains( "RUNNER~1" ) ) ;
281+ assert ! ( !path_str. contains( "%7E" ) ) ;
282+ }
283+
284+ #[ test]
285+ fn test_url_to_path_string_with_cloud_url ( ) {
286+ // Test that url_to_path_string returns URL path for cloud storage
287+ let s3_url = Url :: from_str ( "s3://bucket/path/to/file.parquet" ) . unwrap ( ) ;
288+ let path_str = url_to_path_string ( & s3_url) . unwrap ( ) ;
289+ assert_eq ! ( path_str, "/path/to/file.parquet" ) ;
290+
291+ let gs_url = Url :: from_str ( "gs://bucket/data/file.parquet" ) . unwrap ( ) ;
292+ let path_str = url_to_path_string ( & gs_url) . unwrap ( ) ;
293+ assert_eq ! ( path_str, "/data/file.parquet" ) ;
294+ }
295+
236296 #[ test]
237297 fn join_base_url_with_segments ( ) {
238298 let base_url = Url :: from_str ( "file:///base" ) . unwrap ( ) ;
@@ -316,13 +376,15 @@ mod tests {
316376
317377 #[ test]
318378 fn test_join_base_path_with_segments_windows ( ) {
319- // Windows path with backslash
379+ // Windows path with backslash - should normalize to forward slashes
320380 let result =
321381 join_base_path_with_segments ( r"C:\Users\test" , & [ "Documents" , "file.txt" ] ) . unwrap ( ) ;
322382 assert ! ( result. contains( "Users" ) ) ;
323383 assert ! ( result. contains( "test" ) ) ;
324384 assert ! ( result. contains( "Documents" ) ) ;
325385 assert ! ( result. contains( "file.txt" ) ) ;
386+ // Verify normalized to forward slashes
387+ assert ! ( result. contains( "Users/test/Documents/file.txt" ) || result. contains( "C:/Users" ) ) ;
326388
327389 // Windows path with forward slash
328390 let result =
@@ -331,20 +393,26 @@ mod tests {
331393 assert ! ( result. contains( "test" ) ) ;
332394 assert ! ( result. contains( "Documents" ) ) ;
333395 assert ! ( result. contains( "file.txt" ) ) ;
396+ // Verify normalized to forward slashes
397+ assert ! ( result. contains( "Users/test/Documents/file.txt" ) || result. contains( "C:/Users" ) ) ;
334398
335- // Windows path with mixed separators in segments
399+ // Windows path with mixed separators in segments - should normalize
336400 let result =
337401 join_base_path_with_segments ( r"D:\data" , & [ "dir/subdir" , r"more\files" ] ) . unwrap ( ) ;
338402 assert ! ( result. contains( "data" ) ) ;
339403 assert ! ( result. contains( "dir" ) ) ;
340404 assert ! ( result. contains( "subdir" ) ) ;
341405 assert ! ( result. contains( "more" ) ) ;
342406 assert ! ( result. contains( "files" ) ) ;
407+ // Verify normalized to forward slashes
408+ assert ! ( !result. contains( '\\' ) ) ;
343409
344410 // Just drive letter
345411 let result = join_base_path_with_segments ( "C:" , & [ "temp" , "file.txt" ] ) . unwrap ( ) ;
346412 assert ! ( result. contains( "temp" ) ) ;
347413 assert ! ( result. contains( "file.txt" ) ) ;
414+ // Verify normalized to forward slashes
415+ assert ! ( !result. contains( '\\' ) ) ;
348416 }
349417
350418 #[ test]
0 commit comments