Skip to content

Commit 8ef434d

Browse files
committed
fix(ls): use GetACP to detect UTF-8 encoding on Windows
On Windows, locale environment variables (LC_ALL, LC_COLLATE, LANG) are typically unset, causing get_locale_from_env() to default to UEncoding::Ascii. This makes non-ASCII filenames display as octal escape sequences or `?` characters in ls output. Fix by querying the system ANSI code page via GetACP() when no locale variables are set. If the active code page is 65001 (UTF-8), use UEncoding::Utf8. This aligns with GNU coreutils' gnulib approach which calls locale_charset() -> GetACP() on Windows. Fixes: #11103
1 parent 5605eac commit 8ef434d

2 files changed

Lines changed: 108 additions & 2 deletions

File tree

src/uucore/src/lib/features/i18n/mod.rs

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,24 @@ pub enum UEncoding {
2828
// This ensures real locales like "en-US" won't match
2929
const DEFAULT_LOCALE: Locale = locale!("und");
3030

31+
/// Determine the encoding on Windows from the system ANSI code page.
///
/// Calls the Win32 `GetACP()` function and maps its result:
/// code page 65001 (UTF-8) yields `UEncoding::Utf8`, every other
/// code page yields `UEncoding::Ascii`.
///
/// This mirrors the gnulib approach where `locale_charset()` calls `GetACP()` on Windows.
#[cfg(target_os = "windows")]
fn get_windows_encoding() -> UEncoding {
    /// Code page identifier that Windows uses for UTF-8.
    const CP_UTF8: u32 = 65001;

    unsafe extern "system" {
        fn GetACP() -> u32;
    }

    // SAFETY: `GetACP` is declared above with no parameters and a plain
    // integer return value, so there are no pointers or buffers whose
    // validity the caller has to uphold.
    match unsafe { GetACP() } {
        CP_UTF8 => UEncoding::Utf8,
        _ => UEncoding::Ascii,
    }
}
48+
3149
/// Look at 3 environment variables in the following order
3250
///
3351
/// 1. LC_ALL
@@ -70,8 +88,18 @@ pub fn get_locale_from_env(locale_name: &str) -> (Locale, UEncoding) {
7088
return (locale, encoding);
7189
}
7290
}
73-
// Default POSIX locale representing LC_ALL=C
74-
(DEFAULT_LOCALE, UEncoding::Ascii)
91+
// No locale environment variables set.
92+
// On Windows, check the system ANSI code page to determine encoding,
93+
// matching GNU coreutils' approach (locale_charset -> GetACP).
94+
#[cfg(target_os = "windows")]
95+
{
96+
(DEFAULT_LOCALE, get_windows_encoding())
97+
}
98+
#[cfg(not(target_os = "windows"))]
99+
{
100+
// Default POSIX locale representing LC_ALL=C
101+
(DEFAULT_LOCALE, UEncoding::Ascii)
102+
}
75103
}
76104

77105
/// Get the collating locale from the environment

tests/by-util/test_ls.rs

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7195,3 +7195,81 @@ fn test_ls_a_dotdot_no_error_on_wasi() {
71957195
.stdout_contains("..")
71967196
.no_stderr();
71977197
}
7198+
7199+
/// Verify that ls correctly detects encoding from locale environment variables.
7200+
/// Non-ASCII filenames should be escaped in C/POSIX/non-UTF-8 locales
7201+
/// and displayed as-is in UTF-8 locales.
7202+
#[cfg(not(any(target_vendor = "apple", target_os = "windows", target_os = "openbsd")))]
7203+
mod locale_encoding {
7204+
use uutests::util::TestScenario;
7205+
use uutests::util_name;
7206+
7207+
/// Create a file with a non-ASCII name and check ls output with the given locale.
7208+
/// If `expect_utf8` is true, assert the filename is shown as-is (UTF-8 locale).
7209+
/// Otherwise, assert the non-ASCII character is escaped (ASCII locale).
7210+
fn check_locale(locale: &str, expect_utf8: bool) {
7211+
let scene = TestScenario::new(util_name!());
7212+
let at = &scene.fixtures;
7213+
let filename = uucore::os_str_from_bytes("é".as_bytes())
7214+
.expect("should be valid Unicode");
7215+
at.touch(filename);
7216+
7217+
let result = scene
7218+
.ucmd()
7219+
.env("LC_ALL", locale)
7220+
.arg("--quoting-style=shell-escape")
7221+
.succeeds();
7222+
7223+
if expect_utf8 {
7224+
result.stdout_contains("é");
7225+
} else {
7226+
result.stdout_does_not_contain("é");
7227+
}
7228+
}
7229+
7230+
#[test]
7231+
fn test_ls_locale_c_escapes_non_ascii() {
7232+
check_locale("C", false);
7233+
}
7234+
7235+
#[test]
7236+
fn test_ls_locale_posix_escapes_non_ascii() {
7237+
check_locale("POSIX", false);
7238+
}
7239+
7240+
#[test]
7241+
fn test_ls_locale_utf8_suffix_shows_non_ascii() {
7242+
check_locale("en_US.UTF-8", true);
7243+
}
7244+
7245+
#[test]
7246+
fn test_ls_locale_utf8_lowercase_shows_non_ascii() {
7247+
check_locale("en_US.utf8", true);
7248+
}
7249+
7250+
#[test]
7251+
fn test_ls_locale_iso8859_escapes_non_ascii() {
7252+
check_locale("en_US.ISO-8859-1", false);
7253+
}
7254+
7255+
#[test]
7256+
fn test_ls_locale_no_encoding_suffix_escapes_non_ascii() {
7257+
check_locale("en_US", false);
7258+
}
7259+
}
7260+
7261+
/// On Windows, run ls in a directory containing a file named `文件1` and
/// require that the name appears unchanged on stdout with nothing on stderr.
///
/// The scenario this targets is a system ANSI code page of UTF-8 (ACP 65001).
/// NOTE(review): the test neither queries nor sets the code page itself —
/// confirm it also holds on machines whose ACP is not 65001.
#[cfg(target_os = "windows")]
#[test]
fn test_ls_windows_non_ascii_filename() {
    let scenario = TestScenario::new(util_name!());
    let fixtures = &scenario.fixtures;
    fixtures.touch("文件1");

    let outcome = scenario.ucmd().succeeds();
    outcome.stdout_contains("文件1").no_stderr();
}

0 commit comments

Comments
 (0)