1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
//! # Handle Converter
//!
//! `handle_converter` provides functions to detect confusable Unicode characters in a
//! given input string and return its canonical form.

use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
extern crate alloc;
use crate::types::HandleSuffix;
use alloc::{
	string::{String, ToString},
	vec::Vec,
};

// Will load `CONFUSABLES` with all the confusables at build time.
// See build.rs
include!(concat!(env!("OUT_DIR"), "/confusables.rs"));

/// Delimiter that separates the base handle from its numeric suffix in a display name.
///
/// [`split_display_name`] splits its input on this character.
pub const HANDLE_DELIMITER: char = '.';

/// Reduces a string to its canonical form.
///
/// The input goes through the following passes, in this order:
///
/// 1. every Unicode whitespace character is removed,
/// 2. the text is NFKD-normalized and combining marks are dropped,
/// 3. characters with an entry in `CONFUSABLES` are replaced by their mapped character,
/// 4. ASCII uppercase letters are lowercased (non-ASCII characters are left untouched).
///
/// # Arguments
///
/// * `input_str` - The string to canonicalize.
///
/// # Returns
///
/// A newly allocated string holding the canonical form of `input_str`.
///
pub fn convert_to_canonical(input_str: &str) -> alloc::string::String {
	let mut canonical =
		replace_confusables(&strip_diacriticals(&strip_unicode_whitespace(input_str)));
	// Lowercase in place; only ASCII letters are affected.
	canonical.make_ascii_lowercase();
	canonical
}

/// Substitutes every confusable character in the input with its mapped replacement.
///
/// Each character is looked up in the `CONFUSABLES` map generated at build time. Characters
/// that have an entry are replaced by the mapped value; all other characters are copied
/// through unchanged.
///
/// # Arguments
///
/// * `input_str` - The string whose confusable characters should be replaced.
///
/// # Returns
///
/// A new string with the same number of characters as `input_str`, where each confusable
/// character has been swapped for its replacement.
pub fn replace_confusables(input_str: &str) -> alloc::string::String {
	input_str
		.chars()
		.map(|original| match CONFUSABLES.get(&original) {
			Some(&replacement) => replacement,
			None => original,
		})
		.collect()
}

/// Removes diacritical marks from the input and returns the result as a new `String`.
///
/// # Arguments
/// * `input_str` - The string slice to strip diacritical marks from.
///
/// # Returns
/// A new string holding the NFKD-normalized input with every combining mark removed.
///
/// # Notes
/// The input is first normalized to NFKD, so the returned characters are the decomposed
/// forms rather than the original code points. Combining marks (characters meant to modify
/// the appearance of another character) are then skipped while the output is built.
///
pub fn strip_diacriticals(input_str: &str) -> alloc::string::String {
	let mut stripped = alloc::string::String::new();
	for decomposed in input_str.nfkd() {
		if is_combining_mark(decomposed) {
			// Drop the mark; the base character it modified has already been kept.
			continue;
		}
		stripped.push(decomposed);
	}
	stripped
}

/// Splits the given display name into its base handle and numeric handle suffix.
///
/// A valid display name has the shape `<base><HANDLE_DELIMITER><suffix>`, containing exactly
/// one delimiter and a suffix that parses as a `u16`.
///
/// # Arguments
///
/// * `display_name_str` - The display name to split.
///
/// # Returns
///
/// * `Some((base_handle, suffix))` - the text before the delimiter as an owned `String` and
///   the parsed numeric suffix.
/// * `None` - if the input contains no delimiter, contains more than one delimiter, or the
///   suffix does not parse as a `u16`.
///
pub fn split_display_name(display_name_str: &str) -> Option<(String, HandleSuffix)> {
	// Split at the first delimiter only; no intermediate `Vec` and no allocation until the
	// input has been validated.
	let (base_handle_str, suffix_str) = display_name_str.split_once(HANDLE_DELIMITER)?;

	// Any further delimiter ends up inside `suffix_str` and makes the parse fail, so inputs
	// with more than one delimiter are still rejected.
	let suffix_num = suffix_str.parse::<u16>().ok()?;

	Some((base_handle_str.to_string(), suffix_num))
}

/// Removes every Unicode whitespace character from the input, wherever it occurs.
///
/// # Arguments
///
/// * `input_str` - The string slice to remove whitespace from.
///
/// # Returns
///
/// A new string made of the non-whitespace characters of `input_str`, in their original order.
pub fn strip_unicode_whitespace(input_str: &str) -> String {
	// Splitting on whitespace and gluing the pieces back together with no separator
	// leaves exactly the non-whitespace characters.
	input_str.split_whitespace().collect()
}

/// Trims whitespace from the head and tail and collapses every other run of whitespace
/// into a single ASCII space.
///
/// # Arguments
///
/// * `input_str` - A string slice holding the input whose whitespace should be trimmed and collapsed.
///
/// # Returns
///
/// A new string with no leading or trailing whitespace, in which the non-whitespace
/// segments of `input_str` are separated by exactly one `' '`. The result is empty when the
/// input is empty or consists solely of whitespace.
pub fn trim_and_collapse_whitespace(input_str: &str) -> String {
	// Benchmarked as slightly faster than https://crates.io/crates/collapse
	// `split_whitespace` never yields empty slices, so no extra filtering is required.
	input_str.split_whitespace().collect::<Vec<_>>().join(" ")
}