Skip to content

Commit 9a64ede

Browse files
committed
New UTF-8 encoding infrastructure under UTF8::encode
1 parent 7569a03 commit 9a64ede

13 files changed

Lines changed: 972 additions & 836 deletions

ChangeLog

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,6 @@
1+
2026-03-13 Markus Gans <[email protected]>
2+
* New UTF-8 encoding infrastructure under UTF8::encode
3+
14
2026-03-09 Markus Gans <[email protected]>
25
* Improved FOutputBuffer access
36

examples/termcap.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -297,7 +297,7 @@ void showString()
297297
{
298298
const auto& name = entry.name;
299299
const auto cap = std::size_t(entry.cap);
300-
tcapString (name, tcap_strings[cap].string);
300+
tcapString (name, tcap_strings[cap].string.data);
301301
}
302302
}
303303

final/ftypes.h

Lines changed: 108 additions & 28 deletions
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,9 @@
4141
#include <limits>
4242
#include <memory>
4343
#include <string>
44+
#include <type_traits>
4445
#include <utility>
46+
#include <vector>
4547

4648
#include "final/eventloop/pipedata.h"
4749

@@ -256,47 +258,125 @@ using enable_if_arithmetic_without_char_t =
256258
&& ! std::is_same<char, NumT>::value
257259
, std::nullptr_t>;
258260

259-
// UTF8_Char
260-
//----------------------------------------------------------------------
261-
struct FourByteData
261+
// UTF-8 encoding
262+
namespace UTF8
262263
{
263-
char byte1; // First character
264-
char byte2; // Second character
265-
char byte3; // Third character
266-
char byte4; // Fourth character
267-
};
268264

269-
struct UTF8_Char
265+
inline void expand (std::vector<char>& buffer, std::size_t addend)
270266
{
271-
// Data member
272-
FourByteData u8;
273-
uInt32 length;
267+
buffer.resize(buffer.size() + addend);
268+
}
274269

275-
// Friend Non-member operator functions
276-
friend constexpr auto operator == ( const UTF8_Char& lhs
277-
, const UTF8_Char& rhs ) noexcept -> bool
270+
inline void expand (std::array<char, 4>&, std::size_t)
271+
{ }
272+
273+
template <typename T>
274+
using DecayedT = typename std::decay<T>::type;
275+
276+
template <typename CharBufferT>
277+
using uInt32_if_vector_or_array = std::enable_if_t<
278+
std::is_same<DecayedT<CharBufferT>, std::vector<char>>::value
279+
|| std::is_same<DecayedT<CharBufferT>, std::array<char, 4>>::value
280+
, uInt32>;
281+
282+
#if defined(__CYGWIN__)
283+
284+
template <typename CharBufferT>
285+
inline auto encode (wchar_t ucs, CharBufferT& buffer) -> uInt32_if_vector_or_array<CharBufferT>
286+
{
287+
// Writes UTF-8 bytes to the target array and returns the length
288+
const auto index = std::is_same<CharBufferT, std::vector<char>>::value
289+
? buffer.size()
290+
: 0;
291+
292+
// 1 Byte (7-bit): 0xxxxxxx
293+
if ( ucs < 0x80 )
278294
{
279-
if ( lhs.length != rhs.length )
280-
return false;
295+
expand(buffer, 1);
296+
const auto dest = &buffer[index];
297+
dest[0] = char(ucs);
298+
return 1;
299+
}
300+
301+
// 2 byte (11-bit): 110xxxxx 10xxxxxx
302+
if ( ucs < 0x800 )
303+
{
304+
expand(buffer, 2);
305+
const auto dest = &buffer[index];
306+
dest[0] = char(0xc0 | uChar(ucs >> 6u));
307+
dest[1] = char(0x80 | uChar(ucs & 0x3f));
308+
return 2;
309+
}
310+
311+
// 3 byte (16-bit): 1110xxxx 10xxxxxx 10xxxxxx
312+
expand(buffer, 3);
313+
const auto dest = &buffer[index];
314+
dest[0] = char(0xe0 | uChar(ucs >> 12u));
315+
dest[1] = char(0x80 | uChar((ucs >> 6u) & 0x3f));
316+
dest[2] = char(0x80 | uChar(ucs & 0x3f));
317+
return 3;
318+
}
281319

282-
#if HAVE_BUILTIN(__builtin_bit_cast)
283-
return __builtin_bit_cast(uInt32, lhs.u8) == __builtin_bit_cast(uInt32, rhs.u8);
284320
#else
285-
uInt32 lhs_bytes{};
286-
uInt32 rhs_bytes{};
287-
std::memcpy(&lhs_bytes, &lhs.u8, sizeof(uInt32));
288-
std::memcpy(&rhs_bytes, &rhs.u8, sizeof(uInt32));
289-
return lhs_bytes == rhs_bytes;
290-
#endif
321+
322+
template <typename CharBufferT>
323+
inline auto encode (wchar_t ucs, CharBufferT& buffer) -> uInt32_if_vector_or_array<CharBufferT>
324+
{
325+
// Writes UTF-8 bytes to the target array and returns the length
326+
const auto index = std::is_same<CharBufferT, std::vector<char>>::value
327+
? buffer.size()
328+
: 0;
329+
330+
// 1 Byte (7-bit): 0xxxxxxx
331+
if ( ucs < 0x80 )
332+
{
333+
expand(buffer, 1);
334+
const auto dest = &buffer[index];
335+
dest[0] = char(ucs);
336+
return 1;
291337
}
292338

293-
friend constexpr auto operator != ( const UTF8_Char& lhs
294-
, const UTF8_Char& rhs ) noexcept -> bool
339+
// 2 byte (11-bit): 110xxxxx 10xxxxxx
340+
if ( ucs < 0x800 )
295341
{
296-
return ! ( lhs == rhs );
342+
expand(buffer, 2);
343+
const auto dest = &buffer[index];
344+
dest[0] = char(0xc0 | uChar(ucs >> 6u));
345+
dest[1] = char(0x80 | uChar(ucs & 0x3f));
346+
return 2;
347+
}
348+
349+
// 3 byte (16-bit): 1110xxxx 10xxxxxx 10xxxxxx
350+
if ( ucs < 0x10000 )
351+
{
352+
expand(buffer, 3);
353+
const auto dest = &buffer[index];
354+
dest[0] = char(0xe0 | uChar(ucs >> 12u));
355+
dest[1] = char(0x80 | uChar((ucs >> 6u) & 0x3f));
356+
dest[2] = char(0x80 | uChar(ucs & 0x3f));
357+
return 3;
297358
}
359+
360+
// 4 byte (21-bit): 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
361+
if ( ucs < 0x200000 )
362+
{
363+
expand(buffer, 4);
364+
const auto dest = &buffer[index];
365+
dest[0] = char(0xf0 | uChar(ucs >> 18u));
366+
dest[1] = char(0x80 | uChar((ucs >> 12u) & 0x3f));
367+
dest[2] = char(0x80 | uChar((ucs >> 6u) & 0x3f));
368+
dest[3] = char(0x80 | uChar(ucs & 0x3f));
369+
return 4;
370+
}
371+
372+
return encode(L'', buffer); // Invalid character
373+
}
374+
375+
#endif
376+
298377
};
299378

379+
300380
// FCharAttribute + FAttribute
301381
//----------------------------------------------------------------------
302382
struct FCharAttribute

final/output/tty/fterm.h

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,7 @@
33
* *
44
* This file is part of the FINAL CUT widget toolkit *
55
* *
6-
* Copyright 2012-2025 Markus Gans *
6+
* Copyright 2012-2026 Markus Gans *
77
* *
88
* FINAL CUT is free software; you can redistribute it and/or modify *
99
* it under the terms of the GNU Lesser General Public License as *
@@ -356,7 +356,11 @@ inline auto operator << (std::ostream& os, finalcut::UniChar c) -> std::ostream&
356356
static const auto& data = finalcut::FTermData::getInstance();
357357

358358
if ( data.getTerminalEncoding() == finalcut::Encoding::UTF8 )
359-
return os << finalcut::unicode_to_utf8_string(wchar_t(c));
359+
{
360+
std::array<char, 4> buf{};
361+
const uInt32 len = finalcut::UTF8::encode(wchar_t(c), buf);
362+
return os.write(&buf[0], len);
363+
}
360364

361365
return os << static_cast<char>(uChar(c));
362366
}

final/output/tty/fterm_functions.cpp

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -377,13 +377,6 @@ auto unicode_to_cp437 (wchar_t ucs) -> uChar
377377
return c;
378378
}
379379

380-
//----------------------------------------------------------------------
381-
auto unicode_to_utf8_string (wchar_t ucs) -> std::string
382-
{
383-
UTF8_Char ch = unicode_to_utf8(ucs);
384-
return std::string(&ch.u8.byte1, ch.length);
385-
}
386-
387380
//----------------------------------------------------------------------
388381
auto getFullWidth (const FString& str) -> FString
389382
{

final/output/tty/fterm_functions.h

Lines changed: 0 additions & 53 deletions
Original file line number | Diff line number | Diff line change
@@ -56,7 +56,6 @@ auto isReverseNewFontchar (wchar_t) -> bool;
5656
auto hasFullWidthSupports() -> bool;
5757
auto cp437_to_unicode (uChar) -> wchar_t;
5858
auto unicode_to_cp437 (wchar_t) -> uChar;
59-
auto unicode_to_utf8_string (wchar_t) -> std::string;
6059
auto getFullWidth (const FString&) -> FString;
6160
auto getHalfWidth (const FString&) -> FString;
6261
auto getColumnSubString (const FString&, std::size_t, std::size_t) -> FString;
@@ -73,58 +72,6 @@ auto searchLeftCharBegin (const FString&, std::size_t) -> std::size_t;
7372
auto searchRightCharBegin (const FString&, std::size_t) -> std::size_t;
7473
auto readCursorPos() -> FPoint;
7574

76-
//----------------------------------------------------------------------
77-
#if defined(__CYGWIN__)
78-
constexpr auto unicode_to_utf8 (wchar_t ucs) -> UTF8_Char
79-
{
80-
// 1 Byte (7-bit): 0xxxxxxx
81-
if ( ucs < 0x80 )
82-
return { {char(ucs), '\0', '\0', '\0'}, 1 };
83-
84-
// 2 byte (11-bit): 110xxxxx 10xxxxxx
85-
if ( ucs < 0x800 )
86-
return { { char(0xc0 | uChar(ucs >> 6u))
87-
, char(0x80 | uChar(ucs & 0x3f))
88-
, '\0', '\0' }, 2 };
89-
90-
// 3 byte (16-bit): 1110xxxx 10xxxxxx 10xxxxxx
91-
return { { char(0xe0 | uChar(ucs >> 12u))
92-
, char(0x80 | uChar((ucs >> 6u) & 0x3f))
93-
, char(0x80 | uChar(ucs & 0x3f))
94-
, '\0' }, 3 };
95-
}
96-
97-
#else
98-
constexpr auto unicode_to_utf8 (wchar_t ucs) -> UTF8_Char
99-
{
100-
// 1 Byte (7-bit): 0xxxxxxx
101-
if ( ucs < 0x80 )
102-
return { {char(ucs), '\0', '\0', '\0'}, 1 };
103-
104-
// 2 byte (11-bit): 110xxxxx 10xxxxxx
105-
if ( ucs < 0x800 )
106-
return { { char(0xc0 | uChar(ucs >> 6u))
107-
, char(0x80 | uChar(ucs & 0x3f))
108-
, '\0', '\0' }, 2 };
109-
110-
// 3 byte (16-bit): 1110xxxx 10xxxxxx 10xxxxxx
111-
if ( ucs < 0x10000 )
112-
return { { char(0xe0 | uChar(ucs >> 12u))
113-
, char(0x80 | uChar((ucs >> 6u) & 0x3f))
114-
, char(0x80 | uChar(ucs & 0x3f))
115-
, '\0' }, 3 };
116-
117-
// 4 byte (21-bit): 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
118-
if ( ucs < 0x200000 )
119-
return { { char(0xf0 | uChar(ucs >> 18u))
120-
, char(0x80 | uChar((ucs >> 12u) & 0x3f))
121-
, char(0x80 | uChar((ucs >> 6u) & 0x3f))
122-
, char(0x80 | uChar(ucs & 0x3f)) }, 4 };
123-
124-
return unicode_to_utf8(L''); // Invalid character
125-
}
126-
#endif
127-
12875
//----------------------------------------------------------------------
12976
template<std::size_t size, typename UnaryPredicate>
13077
auto captureTerminalInput ( std::array<char, size>& data

0 commit comments

Comments
 (0)