Added UTF-16/UTF-32 support to the WChar and String classes.

WChar can now decode UTF-16 and UTF-32, and encode UTF-8 and UTF-32 (UTF-16 encoding is still to be written). String now has the functions append(), concat(), compare() and affect(), each with an overload that takes a char[] array plus an encoding, so that it can also be used with UTF-8, UTF-16 and UTF-32 data.
author: Alexis211 <alexis211@gmail.com> 2009-09-16 15:41:10 +0200
committer: Alexis211 <alexis211@gmail.com> 2009-09-16 15:41:10 +0200
commit: 5f88058644587aa255d453eee74c212e53cf9ade (patch)
tree: 664cadc06d75dde2618f2b3cde933f95ba898485 /Source/Kernel/Library
parent: e2e0f21d932224434cb6121165dc00f0c1bb3bdd (diff)
download: Melon-5f88058644587aa255d453eee74c212e53cf9ade.tar.gz
Melon-5f88058644587aa255d453eee74c212e53cf9ade.zip
4 files changed, 138 insertions, 46 deletions
diff --git a/Source/Kernel/Library/String.class.cpp b/Source/Kernel/Library/String.class.cpp
index 6380b25..9d4083b 100644
--- a/Source/Kernel/Library/String.class.cpp
+++ b/Source/Kernel/Library/String.class.cpp
@@ -54,8 +54,8 @@ String::String() {
 	m_length = 0;
 }
 
-String::String(const char* string) {
-	m_length = WChar::utfLen(string);
+String::String(const char* string, u8int encoding) {
+	m_length = WChar::utfLen(string, encoding);
 	if (m_length == 0) {
 		m_string = 0;
 		return;
@@ -63,7 +63,7 @@ String::String(const char* string) {
 	m_string = new WChar[m_length + 1];
 	int i = 0, l = strlen(string), c = 0;
 	while (i < l) {
-		i += m_string[c].affectUtf8(string + i);
+		i += m_string[c].affect(string + i, encoding);
 		c++;
 	}
 	m_string[m_length] = 0;
@@ -86,7 +86,7 @@ String::~String() {
 	if (m_string != 0) delete [] m_string;
 }
 
-void String::operator= (const String &other) {
+void String::affect (const String &other) {
 	m_length = other.m_length;
 	if (m_string != 0) delete [] m_string;
 	if (m_length == 0) {
@@ -100,8 +100,8 @@ void String::operator= (const String &other) {
 	m_string[m_length] = 0;
 }
 
-void String::operator= (const char* string) {
-	m_length = WChar::utfLen(string);
+void String::affect (const char* string, u8int encoding) {
+	m_length = WChar::utfLen(string, encoding);
 	if (m_string != 0) delete [] m_string;
 	if (m_length == 0) {
 		m_string = 0;
@@ -110,13 +110,13 @@ void String::operator= (const char* string) {
 	m_string = new WChar[m_length + 1];
 	int i = 0, l = strlen(string), c = 0;
 	while (i < l) {
-		i += m_string[c].affectUtf8(string + i);
+		i += m_string[c].affect(string + i, encoding);
 		c++;
 	}
 	m_string[m_length] = 0;
 }
 
-bool String::operator== (const String &other) const {
+bool String::compare (const String &other) const {
 	if (m_length != other.m_length) return false;
 	for (u32int i = 0; i < m_length; i++) {
 		if (m_string[i] != other.m_string[i]) return false;
@@ -124,19 +124,19 @@ bool String::operator== (const String &other) const {
 	return true;
 }
 
-bool String::operator== (const char* string) const {
-	if (m_length != WChar::utfLen(string)) return false;
+bool String::compare (const char* string, u8int encoding) const {
+	if (m_length != WChar::utfLen(string, encoding)) return false;
 	int i = 0, l = strlen(string), c = 0;
 	WChar tmp;
 	while (i < l) {
-		i += tmp.affectUtf8(string + i);
+		i += tmp.affect(string + i, encoding);
 		if (m_string[c] != tmp) return false;
 		c++;
 	}
 	return true;
 }
 
-String& String::operator+= (const String &other) {
+String& String::append (const String &other) {
 	WChar* newdata = new WChar[m_length + other.m_length + 1];
 	for (u32int i = 0; i < m_length; i++) {
 		newdata[i] = m_string[i];
@@ -151,14 +151,14 @@ String& String::operator+= (const String &other) {
 	return *this;
 }
 
-String& String::operator+= (const char* other) {
-	WChar* newdata = new WChar[m_length + WChar::utfLen(other) + 1];
+String& String::append (const char* other, u8int encoding) {
+	WChar* newdata = new WChar[m_length + WChar::utfLen(other, encoding) + 1];
 	for (u32int i = 0; i < m_length; i++) {
 		newdata[i] = m_string[i];
 	}
 	int i = 0, l = strlen(other), c = 0;
 	while (i < l) {
-		i += newdata[c + m_length].affectUtf8(other + i);
+		i += newdata[c + m_length].affect(other + i, encoding);
 		c++;
 	}
 	if (m_string != 0) delete [] m_string;
@@ -168,7 +168,7 @@ String& String::operator+= (const char* other) {
 	return *this;
 }
 
-String& String::operator+= (WChar other) {
+String& String::append (WChar other) {
 	WChar* newdata = new WChar[m_length + 2];
 	for (u32int i = 0; i < m_length; i++) {
 		newdata[i] = m_string[i];
@@ -181,17 +181,17 @@ String& String::operator+= (WChar other) {
 	return *this;
 }
 
-String& String::operator+ (const String &other) const {	//Can be optimized
+String String::concat (const String &other) const {	//Can be optimized
 	String ret(*this);
 	return (ret += other);
 }
 
-String& String::operator+ (const char* other) const { //Can be optimized
+String String::concat (const char* other, u8int encoding) const { //Can be optimized
 	String ret(*this);
-	return (ret += other);
+	return (ret.append(other, encoding));
 }
 
-String& String::operator+ (WChar other) const {
+String String::concat (WChar other) const {
 	String ret(*this);
 	return (ret += other);
 }
diff --git a/Source/Kernel/Library/String.class.h b/Source/Kernel/Library/String.class.h
index 01cc6a8..6a9de64 100644
--- a/Source/Kernel/Library/String.class.h
+++ b/Source/Kernel/Library/String.class.h
@@ -15,26 +15,40 @@ class String {
 	static String hex(u32int number);
 	static String number(s32int number);
 
-	String(const char* string);
+	String(const char* string, u8int encoding = UE_UTF8);
 	String();
 	String(const String &other);
 	~String();
 
-	void operator= (const String &other);
-	void operator= (const char* string);
-
-	bool operator== (const String &other) const;
-	bool operator== (const char* string) const;
-	bool operator!= (const String &other) { return !(operator== (other)); }
-	bool operator!= (const char* other) { return !(operator== (other)); }
-	String &operator+= (const String &other);
-	String &operator+= (const char* other);
-	String &operator+= (WChar other);
-	String &operator+ (const String &other) const;
-	String &operator+ (const char* other) const;
-	String &operator+ (WChar other) const;
-	s32int toInt() const;
-	u32int toInt16() const;	//From HEX
+	void affect(const String &other);
+	void affect(const char* string, u8int encoding = UE_UTF8);
+	void operator= (const String &other) { affect(other); }
+	void operator= (const char* other) { affect(other); }
+
+	bool compare(const String &other) const;
+	bool compare(const char* string, u8int encoding = UE_UTF8) const;
+	bool operator== (const String &other) const { return compare(other); }
+	bool operator== (const char* other) const { return compare(other); }
+	bool operator!= (const String &other) const { return !compare(other); }	//const, like operator== : usable on const Strings
+	bool operator!= (const char* other) const { return !compare(other); }	//Compares against UTF-8 text (compare()'s default encoding)
+
+	String& append(const String &other);
+	String& append(const char* other, u8int encoding = UE_UTF8);
+	String& append(WChar other);
+	String &operator+= (const String &other) { return append(other); }
+	String &operator+= (const char* other) { return append(other); }
+	String &operator+= (WChar other) { return append(other); }
+
+	String concat(const String &other) const;
+	String concat(const char* other, u8int encoding = UE_UTF8) const;
+	String concat(WChar other) const;
+	String operator+ (const String &other) const { return concat(other); }
+	String operator+ (const char* other) const { return concat(other); }
+	String operator+ (WChar other) const { return concat(other); }
+
+	s32int toInt() const; 	//Convert from DEC
+	u32int toInt16() const;	//Convert from HEX
+
 	WChar& operator[] (int index) const;
 
 	u32int size() const;
diff --git a/Source/Kernel/Library/WChar.class.cpp b/Source/Kernel/Library/WChar.class.cpp
index d7f01de..aad46c3 100644
--- a/Source/Kernel/Library/WChar.class.cpp
+++ b/Source/Kernel/Library/WChar.class.cpp
@@ -19,19 +19,28 @@ WChar::WChar(char c) {
 	affectAscii(c);
 }
 
-WChar::WChar(const char* c, u8int encoding) {	//TODO : take encoding into account
-	affectUtf8(c);
+WChar::WChar(const char* c, u8int encoding) {	//Builds the character by decoding c in the given encoding
+	if (encoding == UE_UTF8) 	affectUtf8(c);
+	if (encoding == UE_UTF16)	affectUtf16(c);
+	if (encoding == UE_UTF32)	affectUtf32(c);	//NOTE(review): no branch for any other encoding value - confirm value gets initialised in that case
 }
 
 u32int WChar::utfLen(const char* c, u8int encoding) {
 	int i = 0, l = CMem::strlen(c), co = 0;
 	while (i < l) {
-		if ((c[i] & 0x80) == 0) i += 1;
-		else if ((c[i] & 0xE0) == 0xC0) i += 2;
-		else if ((c[i] & 0xF0) == 0xE0) i += 3;
-		else if ((c[i] & 0xF8) == 0xF0) i += 4;
-		else i += 1;
-		co++;
+		if (encoding == UE_UTF8) {	//Each pass steps over one character; in UTF-8 the lead byte gives its size
+			if ((c[i] & 0x80) == 0) i += 1;
+			else if ((c[i] & 0xE0) == 0xC0) i += 2;
+			else if ((c[i] & 0xF0) == 0xE0) i += 3;
+			else if ((c[i] & 0xF8) == 0xF0) i += 4;
+			else i += 1;	//Unexpected lead byte : skipped and counted as one character
+		} else if (encoding == UE_UTF16) {
+			if (i + 2 < l and (c[i] & 0xFC) == 0xD8 and (c[i + 2] & 0xFC) == 0xDC) i += 4;	//Surrogate pair; never peek past the terminator
+			else i += 2;
+		} else {
+			i += (encoding == UE_UTF32) ? 4 : 1;	//Unknown encodings : one byte per character, as WChar::affect does
+		}
+		co++;	//Count the character whatever the encoding (was only done for UTF-8, so UTF-16/32 gave 0)
 	}
 	return co;
 }
@@ -67,6 +76,31 @@ u32int WChar::affectUtf8(const char* c) {	//Returns the number of bytes for the
 	return 1;
 }
 
+u32int WChar::affectUtf16(const char* c) {	//Decodes one UTF-16 character (high byte first) from c; returns the bytes consumed (2 or 4)
+	if ((c[0] & 0xFC) == 0xD8 and		// 11111100b, 11011000b
+		(c[2] & 0xFC) == 0xDC) {		// 11111100b, 11011100b
+		u32int w = ((c[0] & 0x03) << 2) | ((c[1] & 0xC0) >> 6);
+		u32int x = (c[1] & 0x3F);
+		u32int y = ((c[2] & 0x03) << 8) | (c[3] & 0xFF);	//Low ten bits : two from c[2], eight from c[3] (was c[2] twice)
+		value = ((w + 1) << 16) | (x << 10) | y;
+		if (value >= 0xD800 and value <= 0xDFFF) value = 0;	//These values are unallowed
+		if (value >= 0xFFFE and value <= 0xFFFF) value = 0;	
+		return 4;
+	} else {
+		value = ((c[0] & 0xFF) << 8) | (c[1] & 0xFF);	//Mask each byte : plain char may be signed and would sign-extend
+		if (value >= 0xD800 and value <= 0xDFFF) value = 0;	//These values are unallowed
+		if (value >= 0xFFFE and value <= 0xFFFF) value = 0;	
+		return 2;
+	}
+}
+
+u32int WChar::affectUtf32(const char* c) {	//Decodes one UTF-32 character (high byte first) from c; always consumes 4 bytes
+	value = ((u32int)(c[0] & 0xFF) << 24) | ((c[1] & 0xFF) << 16) | ((c[2] & 0xFF) << 8) | (c[3] & 0xFF);	//Bytes masked : plain char may be signed
+	if (value >= 0xD800 and value <= 0xDFFF) value = 0;	//These values are unallowed
+	if (value >= 0xFFFE and value <= 0xFFFF) value = 0;	
+	return 4;
+}
+
 u8int WChar::toAscii() {
 	if (value < 128) return (char)value;
 	for (int i = 0; i < 128; i++) {
@@ -74,3 +108,35 @@ u8int WChar::toAscii() {
 	}
 	return '?';
 }
+
+uchar_repr_t WChar::toUtf8() {	//Encodes value as a 1 to 4 byte UTF-8 sequence stored in r.c
+	uchar_repr_t r;
+	r.i = 0;	//Presumably clears r.c as well (uchar_repr_t is declared elsewhere) - TODO confirm
+	if (value < 128) {
+		r.c[0] = value;
+	} else if (value < 2048) {	//Two bytes carry 11 bits only (was 4096, which dropped the top bit of 0x800-0xFFF)
+		r.c[0] = 0xC0 | ((value & 0x07C0) >> 6);
+		r.c[1] = 0x80 | (value & 0x3F);
+	} else if (value < 65536) {
+		r.c[0] = 0xE0 | ((value & 0xF000) >> 12);
+		r.c[1] = 0x80 | ((value & 0x0FC0) >> 6);
+		r.c[2] = 0x80 | (value & 0x003F);
+	} else {	//Four-byte form : only the low 21 bits of value are kept
+		r.c[0] = 0xF0 | ((value & 0x1C0000) >> 18);
+		r.c[1] = 0x80 | ((value & 0x3F000) >> 12);
+		r.c[2] = 0x80 | ((value & 0x0FC0) >> 6);
+		r.c[3] = 0x80 | (value & 0x003F);
+	}
+	return r;
+}
+
+//TODO : code WChar::toUtf16
+
+uchar_repr_t WChar::toUtf32() {	//Returns the four bytes of value in r.c, most significant byte first
+	uchar_repr_t r;
+	r.c[0] = (value >> 24) & 0xFF;
+	r.c[1] = (value >> 16) & 0xFF;
+	r.c[2] = (value >> 8) & 0xFF;
+	r.c[3] = value & 0xFF;
+	return r;
+}
diff --git a/Source/Kernel/Library/WChar.class.h b/Source/Kernel/Library/WChar.class.h
index fc00577..63f1ea3 100644
--- a/Source/Kernel/Library/WChar.class.h
+++ b/Source/Kernel/Library/WChar.class.h
@@ -26,11 +26,23 @@ struct WChar {
 
 	void affectAscii(char c);
 	u32int affectUtf8(const char* c);
-	void affectUtf16(const char* c);
-	void affectUtf32(const char* c);
+	u32int affectUtf16(const char* c);
+	u32int affectUtf32(const char* c);
+
+	u32int affect(const char* c, u8int encoding = UE_UTF8) {	//Decodes one character from c with the decoder matching encoding
+		if (encoding == UE_UTF8) return affectUtf8(c);	//Each decoder's return value is passed straight back to the caller
+		if (encoding == UE_UTF16) return affectUtf16(c);
+		if (encoding == UE_UTF32) return affectUtf32(c);
+		affectAscii(c[0]);	//Default case :/
+		return 1;	//Any other encoding value : one byte was handed to affectAscii
+	}
 
 	u8int toAscii();
 
+	uchar_repr_t toUtf8();
+	uchar_repr_t toUtf16();
+	uchar_repr_t toUtf32();
+
 	inline WChar operator+ (u32int other) {
 		WChar r;
 		r.value = value + other;
author	Alexis211 <alexis211@gmail.com>	2009-09-16 15:41:10 +0200
committer	Alexis211 <alexis211@gmail.com>	2009-09-16 15:41:10 +0200
commit	5f88058644587aa255d453eee74c212e53cf9ade (patch)
tree	664cadc06d75dde2618f2b3cde933f95ba898485 /Source/Kernel/Library
parent	e2e0f21d932224434cb6121165dc00f0c1bb3bdd (diff)
download	Melon-5f88058644587aa255d453eee74c212e53cf9ade.tar.gz Melon-5f88058644587aa255d453eee74c212e53cf9ade.zip