-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathencoding.rs
More file actions
140 lines (124 loc) · 5.38 KB
/
Copy pathencoding.rs
File metadata and controls
140 lines (124 loc) · 5.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
//! Encoding traits for character encoding implementations.
//!
//! This module provides the core traits that define how characters are encoded:
//!
//! - [`Encoding`]: Base trait for all encodings
//! - [`LimitedEncoding`]: Marker for encodings that cannot represent all Unicode
//! - [`UniversalEncoding`]: Marker for encodings that can represent all Unicode
//!
//! # Examples
//!
//! ```
//! use stringly::{Encoding, Utf8};
//!
//! // Check encoding properties
//! assert_eq!(Utf8::NAME, "UTF-8");
//! assert!(!Utf8::IS_FIXED_WIDTH);
//! assert_eq!(Utf8::MAX_CHAR_LEN, 4);
//!
//! // Validate bytes
//! assert!(Utf8::validate(b"hello").is_ok());
//! assert!(Utf8::validate(b"\xff\xfe").is_err());
//!
//! // Encode a character
//! let mut buf = [0u8; 4];
//! let len = Utf8::encode_char('a', &mut buf);
//! assert_eq!(&buf[..len], b"a");
//!
//! let len = Utf8::encode_char('\u{1F600}', &mut buf);
//! assert_eq!(len, 4); // Emoji takes 4 bytes in UTF-8
//! ```
use crate::error::EncodingError;
/// A trait defining a character encoding.
///
/// Implementors are zero-sized types (ZSTs) that serve as type-level markers
/// for the encoding of a string. Every operation is an associated constant or
/// an associated function, so no value of the implementing type is needed.
///
/// # Encodings with a restricted repertoire
///
/// The provided [`try_encode_char`](Encoding::try_encode_char) consults
/// [`can_encode`](Encoding::can_encode) before delegating to
/// [`encode_char`](Encoding::encode_char). An encoding that cannot represent
/// every character therefore only *has* to override `can_encode`; it may still
/// override `try_encode_char` as well, for example to avoid a second lookup.
pub trait Encoding: Sized + 'static {
    /// The human-readable name of this encoding (e.g., "UTF-8", "UTF-16LE").
    const NAME: &'static str;

    /// Whether this encoding uses a fixed number of bytes per character.
    const IS_FIXED_WIDTH: bool;

    /// The number of bytes per character, if this is a fixed-width encoding.
    ///
    /// `None` for variable-width encodings. Implementations should keep this
    /// consistent with [`IS_FIXED_WIDTH`](Encoding::IS_FIXED_WIDTH).
    const BYTES_PER_CHAR: Option<usize>;

    /// The maximum number of bytes a single character can occupy in this encoding.
    const MAX_CHAR_LEN: usize;

    /// The number of bytes used to represent the null character ('\0') in this encoding.
    ///
    /// For UTF-8 and single-byte encodings this is 1.
    /// For UTF-16 this is 2 (the code unit 0x0000).
    /// For UTF-32 this is 4 (the code unit 0x00000000).
    ///
    /// Defaults to 1; encodings with wider code units must override it.
    const NULL_LEN: usize = 1;

    /// Validates that the given byte slice is valid for this encoding.
    ///
    /// # Errors
    ///
    /// Returns an `EncodingError` indicating where validation failed if
    /// `bytes` is not well-formed; returns `Ok(())` otherwise.
    fn validate(bytes: &[u8]) -> Result<(), EncodingError>;

    /// Decodes a character starting at the given byte offset.
    ///
    /// Returns `Some((char, next_offset))` where `next_offset` is the byte index
    /// immediately after the decoded character, or `None` if no valid character
    /// starts at `offset`.
    ///
    /// # Panics
    ///
    /// May panic if `offset` is out of bounds or not on a character boundary.
    fn decode_char_at(bytes: &[u8], offset: usize) -> Option<(char, usize)>;

    /// Returns the number of bytes needed to encode the given character.
    fn encoded_len(c: char) -> usize;

    /// Encodes a character into the given buffer.
    ///
    /// Returns the number of bytes written.
    ///
    /// # Panics
    ///
    /// Panics if the buffer is too small to hold the encoded character.
    fn encode_char(c: char, buf: &mut [u8]) -> usize;

    /// Checks if the given index is a valid character boundary in the byte slice.
    ///
    /// For self-synchronizing encodings like UTF-8, this is O(1).
    /// For non-self-synchronizing encodings, this may be O(n).
    fn is_char_boundary(bytes: &[u8], index: usize) -> bool;

    /// Decodes the character ending just before the given byte offset.
    ///
    /// Returns `Some((char, start_offset))` where `start_offset` is the byte index
    /// where the character starts, or `None` if no valid character ends at `offset`.
    ///
    /// For non-self-synchronizing encodings, this may be O(n) as it needs to scan
    /// from the beginning to find character boundaries.
    fn decode_char_before(bytes: &[u8], offset: usize) -> Option<(char, usize)>;

    /// Attempts to encode a character into the given buffer.
    ///
    /// Returns `Some(bytes_written)` if the character can be represented in this
    /// encoding, or `None` if the character cannot be encoded.
    ///
    /// The provided implementation returns `None` when
    /// [`can_encode`](Encoding::can_encode) reports `false` and otherwise
    /// forwards to [`encode_char`](Encoding::encode_char). For encodings that
    /// keep the default `can_encode` (UTF-8, UTF-16, UTF-32, ...) this always
    /// returns `Some`.
    ///
    /// # Panics
    ///
    /// The provided implementation panics whenever `encode_char` does, i.e. if
    /// the character is encodable but the buffer is too small to hold it.
    fn try_encode_char(c: char, buf: &mut [u8]) -> Option<usize> {
        // Ask `can_encode` first so that `encode_char` is never handed a
        // character outside the encoding's repertoire.
        if Self::can_encode(c) {
            Some(Self::encode_char(c, buf))
        } else {
            None
        }
    }

    /// Returns `true` if this encoding can represent the given character.
    ///
    /// The provided implementation always returns `true`, which is correct for
    /// encodings that can represent all Unicode characters (UTF-8, UTF-16,
    /// UTF-32). Limited encodings like codepages must override it to return
    /// `false` for characters outside the encoding's repertoire.
    fn can_encode(c: char) -> bool {
        // The parameter keeps its documented name; this only marks it as used.
        let _ = c;
        true
    }
}
/// Tags an [`Encoding`] that is unable to represent every Unicode code point.
///
/// The trait declares no items of its own; it exists so that generic code can
/// tell, at the type level, that encoding a character may fail. Conversions
/// into such an encoding should therefore be fallible: use `TryFrom`.
pub trait LimitedEncoding: Encoding {}
/// Tags an [`Encoding`] that is able to represent every Unicode code point.
///
/// UTF-8, UTF-16, UTF-32 and GB18030 are encodings of this kind. Because no
/// character is ever unrepresentable in them, converting text from one
/// universal encoding to another is lossless and cannot fail.
///
/// The trait declares no items of its own.
pub trait UniversalEncoding: Encoding {}