Skip to content

Commit 284e078

Browse files
committed
update
1 parent 141a0bc commit 284e078

3 files changed

Lines changed: 89 additions & 17 deletions

File tree

crates/core/src/storage/mod.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -283,10 +283,12 @@ pub async fn get_leaf_dirs(storage: &Storage, subdir: Option<&str>) -> Result<Ve
283283
next_subdir.push(curr);
284284
}
285285
next_subdir.push(child_dir);
286-
let next_subdir = next_subdir
286+
let next_subdir_str = next_subdir
287287
.to_str()
288288
.ok_or_else(|| InvalidPath(format!("Failed to convert path: {:?}", next_subdir)))?;
289-
let curr_leaf_dir = get_leaf_dirs(storage, Some(next_subdir)).await?;
289+
// Normalize to forward slashes for consistency across platforms
290+
let next_subdir_normalized = next_subdir_str.replace('\\', "/");
291+
let curr_leaf_dir = get_leaf_dirs(storage, Some(&next_subdir_normalized)).await?;
290292
leaf_dirs.extend(curr_leaf_dir);
291293
}
292294
}

crates/core/src/storage/util.rs

Lines changed: 81 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,14 @@ use url::Url;
2525
use crate::storage::error::StorageError::{InvalidPath, UrlParseError};
2626
use crate::storage::Result;
2727

28+
/// Normalizes path separators to forward slashes for cross-platform consistency.
29+
///
30+
/// Windows paths use backslashes, but for consistency with URLs and storage operations,
31+
/// we normalize all paths to use forward slashes.
32+
fn normalize_path_separators(path: &str) -> String {
33+
path.replace('\\', "/")
34+
}
35+
2836
/// Parses a URI string into a URL.
2937
pub fn parse_uri(uri: &str) -> Result<Url> {
3038
// Check for Windows drive paths first to prevent them from being parsed as URLs
@@ -33,10 +41,10 @@ pub fn parse_uri(uri: &str) -> Result<Url> {
3341
Url::from_directory_path(uri)
3442
.map_err(|_| UrlParseError(url::ParseError::EmptyHost))?
3543
} else {
36-
match Url::parse(uri) {
37-
Ok(url) => url,
38-
Err(e) => Url::from_directory_path(uri).map_err(|_| UrlParseError(e))?,
39-
}
44+
// Try parsing as URL first, fall back to directory path for local paths
45+
Url::parse(uri).or_else(|e| {
46+
Url::from_directory_path(uri).map_err(|_| UrlParseError(e))
47+
})?
4048
};
4149

4250
if url.path().ends_with('/') {
@@ -52,6 +60,30 @@ pub fn get_scheme_authority(url: &Url) -> String {
5260
format!("{}://{}", url.scheme(), url.authority())
5361
}
5462

63+
/// Converts a URL to a path string suitable for file system operations.
64+
///
65+
/// For file:// URLs, this returns the decoded native file path to avoid
66+
/// percent-encoding issues (e.g., ~ being encoded as %7E on Windows).
67+
/// For other schemes (s3://, gs://, etc.), returns the URL path as-is.
68+
///
69+
/// # Arguments
70+
/// * `url` - The URL to convert
71+
///
72+
/// # Returns
73+
/// A path string suitable for file operations
74+
pub fn url_to_path_string(url: &Url) -> Result<String> {
75+
if url.scheme() == "file" {
76+
// Use to_file_path() to properly decode file URLs and avoid percent-encoding issues
77+
// This is critical on Windows where ~ gets encoded as %7E in URL paths
78+
url.to_file_path()
79+
.map_err(|_| InvalidPath(format!("Failed to convert file URL to path: {}", url)))
80+
.map(|p| p.to_string_lossy().to_string())
81+
} else {
82+
// For cloud storage URLs (s3://, gs://, etc.), use the URL path directly
83+
Ok(url.path().to_string())
84+
}
85+
}
86+
5587
/// Joins a base URL with a list of segments.
5688
///
5789
/// # Arguments
@@ -113,7 +145,7 @@ pub fn join_base_path_with_relative_path(base: &str, relative: &str) -> Result<S
113145
/// * `segments` - Path segments to append
114146
///
115147
/// # Returns
116-
/// The joined path as a string, or an error if the input is invalid
148+
/// The joined path as a string with forward slashes, or an error if the input is invalid
117149
pub fn join_base_path_with_segments(base: &str, segments: &[&str]) -> Result<String> {
118150
// 1) Windows absolute path check FIRST (before URL parsing)
119151
// This prevents "C:/foo" from being parsed as a URL with scheme "C"
@@ -127,7 +159,8 @@ pub fn join_base_path_with_segments(base: &str, segments: &[&str]) -> Result<Str
127159
}
128160
}
129161
}
130-
return Ok(path.to_string_lossy().into_owned());
162+
let path_str = path.to_string_lossy();
163+
return Ok(normalize_path_separators(&path_str));
131164
}
132165

133166
// 2) Try URL mode: parse base as absolute URL
@@ -164,7 +197,8 @@ pub fn join_base_path_with_segments(base: &str, segments: &[&str]) -> Result<Str
164197
}
165198
}
166199
}
167-
Ok(path.to_string_lossy().into_owned())
200+
let path_str = path.to_string_lossy();
201+
Ok(normalize_path_separators(&path_str))
168202
}
169203

170204
/// Detects if a string starts like a Windows drive path: `C:\` or `C:/`
@@ -224,15 +258,41 @@ mod tests {
224258
assert!(url.path().contains("test"));
225259
assert!(url.path().contains("data"));
226260

227-
// Windows path with drive letter only
228-
let url = parse_uri("D:\\").unwrap();
261+
// Windows path with drive letter and trailing backslash
262+
let url = parse_uri(r"D:\").unwrap();
229263
assert_eq!(url.scheme(), "file");
230264

231-
// Windows path with drive letter and colon
232-
let url = parse_uri("E:").unwrap();
265+
// Windows path with drive letter and trailing forward slash
266+
let url = parse_uri("D:/").unwrap();
233267
assert_eq!(url.scheme(), "file");
234268
}
235269

270+
#[test]
271+
fn test_url_to_path_string_with_file_url() {
272+
// Test that url_to_path_string properly decodes file:// URLs
273+
// This is critical on Windows where ~ gets encoded as %7E in URL paths
274+
let path_with_tilde = "/tmp/RUNNER~1/test.parquet";
275+
let url = Url::from_directory_path(path_with_tilde).unwrap();
276+
277+
let path_str = url_to_path_string(&url).unwrap();
278+
279+
// The path should contain the original tilde (not %7E)
280+
assert!(path_str.contains("RUNNER~1"));
281+
assert!(!path_str.contains("%7E"));
282+
}
283+
284+
#[test]
285+
fn test_url_to_path_string_with_cloud_url() {
286+
// Test that url_to_path_string returns URL path for cloud storage
287+
let s3_url = Url::from_str("s3://bucket/path/to/file.parquet").unwrap();
288+
let path_str = url_to_path_string(&s3_url).unwrap();
289+
assert_eq!(path_str, "/path/to/file.parquet");
290+
291+
let gs_url = Url::from_str("gs://bucket/data/file.parquet").unwrap();
292+
let path_str = url_to_path_string(&gs_url).unwrap();
293+
assert_eq!(path_str, "/data/file.parquet");
294+
}
295+
236296
#[test]
237297
fn join_base_url_with_segments() {
238298
let base_url = Url::from_str("file:///base").unwrap();
@@ -316,13 +376,15 @@ mod tests {
316376

317377
#[test]
318378
fn test_join_base_path_with_segments_windows() {
319-
// Windows path with backslash
379+
// Windows path with backslash - should normalize to forward slashes
320380
let result =
321381
join_base_path_with_segments(r"C:\Users\test", &["Documents", "file.txt"]).unwrap();
322382
assert!(result.contains("Users"));
323383
assert!(result.contains("test"));
324384
assert!(result.contains("Documents"));
325385
assert!(result.contains("file.txt"));
386+
// Verify normalized to forward slashes
387+
assert!(result.contains("Users/test/Documents/file.txt") || result.contains("C:/Users"));
326388

327389
// Windows path with forward slash
328390
let result =
@@ -331,20 +393,26 @@ mod tests {
331393
assert!(result.contains("test"));
332394
assert!(result.contains("Documents"));
333395
assert!(result.contains("file.txt"));
396+
// Verify normalized to forward slashes
397+
assert!(result.contains("Users/test/Documents/file.txt") || result.contains("C:/Users"));
334398

335-
// Windows path with mixed separators in segments
399+
// Windows path with mixed separators in segments - should normalize
336400
let result =
337401
join_base_path_with_segments(r"D:\data", &["dir/subdir", r"more\files"]).unwrap();
338402
assert!(result.contains("data"));
339403
assert!(result.contains("dir"));
340404
assert!(result.contains("subdir"));
341405
assert!(result.contains("more"));
342406
assert!(result.contains("files"));
407+
// Verify normalized to forward slashes
408+
assert!(!result.contains('\\'));
343409

344410
// Just drive letter
345411
let result = join_base_path_with_segments("C:", &["temp", "file.txt"]).unwrap();
346412
assert!(result.contains("temp"));
347413
assert!(result.contains("file.txt"));
414+
// Verify normalized to forward slashes
415+
assert!(!result.contains('\\'));
348416
}
349417

350418
#[test]

crates/datafusion/src/lib.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ use datafusion_physical_expr::create_physical_expr;
4646
use crate::util::expr::exprs_to_filters;
4747
use hudi_core::config::read::HudiReadConfig::InputPartitions;
4848
use hudi_core::config::util::empty_options;
49-
use hudi_core::storage::util::{get_scheme_authority, join_url_segments};
49+
use hudi_core::storage::util::{get_scheme_authority, join_url_segments, url_to_path_string};
5050
use hudi_core::table::Table as HudiTable;
5151

5252
/// Create a `HudiDataSource`.
@@ -195,7 +195,9 @@ impl TableProvider for HudiDataSource {
195195
let url = join_url_segments(&base_url, &[relative_path.as_str()])
196196
.map_err(|e| Execution(format!("Failed to join URL segments: {e:?}")))?;
197197
let size = f.base_file.file_metadata.as_ref().map_or(0, |m| m.size);
198-
let partitioned_file = PartitionedFile::new(url.path(), size as u64);
198+
let path_str = url_to_path_string(&url)
199+
.map_err(|e| Execution(format!("Failed to convert URL to path: {e:?}")))?;
200+
let partitioned_file = PartitionedFile::new(path_str, size as u64);
199201
parquet_file_group_vec.push(partitioned_file);
200202
}
201203
parquet_file_groups.push(parquet_file_group_vec)

0 commit comments

Comments
 (0)