Here's the code:
#ifndef STRING_H_
#define STRING_H_
#include "Core.h"
class String{
private:
wchar_t* value;
int size;
public:
String();
String(const wchar_t* src);
String(const wchar_t* src, int offset, int count);
String(String& s);
String(const int& num);
String(const long& l);
String(const float& f);
String(const double& d);
String(const wchar_t& c);
operator const wchar_t* () const {
return value;
}
~String(){
size = 0;
delete [] value;
}
int length() const;
const wchar_t& wchar_tAt(int index) const;
//the length of the substring is endIndex-beginIndex
String substring(int beginIndex, int endIndex) const;
String substring(int beginIndex) const;
int indexOf(const wchar_t& c) const;
int indexOf(const wchar_t& c, int fromIndex) const;
int indexOf(const String& s) const;
int indexOf(const String& s, int fromIndex) const;
int indexOf(const wchar_t* s) const;
int indexOf(const wchar_t* s, int fromIndex) const;
int lastIndexOf(wchar_t c) const;
int lastIndexOf(wchar_t c, int fromIndex) const;
int lastIndexOf(const String& s) const;
int lastIndexOf(const String& s, int fromIndex) const;
int lastIndexOf(const wchar_t* s) const;
int lastIndexOf(const wchar_t* s, int fromIndex) const;
/*
//these two functions cannot be properly implemented using plain
pointers, an Array class is required?
String** split(const wchar_t& c) const;
String** split(const String& s) const;
*/
String toLowerCase() const;
String toUpperCase() const;
bool isEmpty() const;
bool startsWith(const String* s) const;
bool endsWith(const String* s) const;
bool startsWith(const wchar_t* s) const;
bool endsWith(const wchar_t* s) const;
bool equals(const String* s) const;
bool equals(const wchar_t* s) const;
String trim();
String trimLeft();
String trimRight();
const wchar_t* toCString() const;
String& operator+=(const String& str);
String& operator+=(const wchar_t* str);
String operator=(String& str);
String operator=(const wchar_t* str);
String operator+(const wchar_t* str) const;
bool operator==(const String* s) const;
};
String operator+(const String& str1, const String& str2);
String operator+(const wchar_t* str1, const String& str2);
#endif
#include "String.h"
String::String(){
size = 0;
value = new wchar_t[1]; //one wchar_t to store the terminating null
value[0] = '\0';
}
String::String(const int& num) {
if(num == minInteger)
String(L"-2147483648");
else{
int lenOfNumeric = num < 0 ? ::getLengthOfNumeric<int>(-num) + 1 : ::getLengthOfNumeric<int>(num);
value = new wchar_t[lenOfNumeric + 1];
::getChars(num, value, lenOfNumeric);
value[lenOfNumeric] = '\0';
size = lenOfNumeric;
}
}
String::String(const long& num) {
if(num == minInteger)
String(L"-2147483648");
else{
int lenOfNumeric = num < 0 ? ::getLengthOfNumeric<long>(-num) + 1 : ::getLengthOfNumeric<long>(num);
value = new wchar_t[lenOfNumeric + 1];
::getChars(num, value, lenOfNumeric);
value[lenOfNumeric] = '\0';
size = lenOfNumeric;
}
}
//will fail if str is null
String::String(const wchar_t* str){
size = wcslen(str);
value = new wchar_t[size + 1];
if(size > 0)
wcscpy(value, str);
else
value[0] = '\0';
}
//copy constructor
String::String(String& s){
size = s.length();
if(size == 0){
value = new wchar_t[1];
value[0] = '\0';
}else{
value = new wchar_t[size + 1];
wcscpy(value, s.value);
}
}
//will fail if str is null
String::String(const wchar_t* str, int offset, int count){
size = 0;
int len = wcslen(str);
if(len == 0 || offset >= len){
//in this case, create an empty String
value = new wchar_t[1];
value[0] = '\0';
}else{
if(offset < 0) offset = 0;
if(count > len - offset) count = len - offset;
value = new wchar_t[count + 1];
int index = 0;
int endIndex = offset + count;
for(int i = offset; i < endIndex; ++i){
value[index] = str[i];
++index;
}
value[index] = '\0';
//only in this case will the size of the String be changed
size = count;
}
}
int String::length() const {
return size;
}
const wchar_t& String::wchar_tAt(int index) const{
if(size == 0 || index < 0 || index >= size)
throw StringIndexOutOfBoundsException(new String(L"out of bounds."));
return value[index];
}
String String::substring(int beginIndex, int endIndex) const {
if(size == 0 || beginIndex < 0 || beginIndex >= size)
throw StringIndexOutOfBoundsException(new String(L"out of bounds."));
if(endIndex > size)
throw StringIndexOutOfBoundsException(new String(L"out of bounds."));
if(beginIndex > endIndex)
throw StringIndexOutOfBoundsException(new String(L"out of bounds."));
wchar_t* temp = new wchar_t[endIndex - beginIndex + 1];
int index = 0;
for(int i = beginIndex; i < endIndex; ++i){
temp[index] = value[i];
++index;
}
temp[index] = '\0';
String s(temp);
delete [] temp;
return s;
}
String String::substring(int beginIndex) const{
if(size == 0 || beginIndex < 0 || beginIndex >= size)
throw StringIndexOutOfBoundsException(new String(L"out of bounds."));
return substring(beginIndex, size);
}
int String::indexOf(const wchar_t& c) const {
return indexOf(c, 0);
}
int String::indexOf(const wchar_t& c, int fromIndex) const {
if(fromIndex < 0) fromIndex = 0;
else if(fromIndex >= size) return -1;
for(int i = fromIndex; i < size; ++i){
if(value[i] == c)
return i;
}
return -1;
}
int String::indexOf(const String& s) const {
return indexOf(s.toCString());
}
int String::indexOf(const String& s, int fromIndex) const {
return indexOf(s.toCString(), fromIndex);
}
int String::indexOf(const wchar_t* s) const {
return indexOf(s, 0);
}
int String::indexOf(const wchar_t* s, int fromIndex) const {
if(!s || size == 0) return -1;
if(fromIndex < 0) fromIndex = 0;
else if(fromIndex >= size) return -1;
int len = wcslen(s);
if(len == 0) return -1;
if(len + fromIndex > size) return -1;
int countMatched = 0;
int firstFoundIndex = 0;
for(int i = fromIndex; i < size; ++i){
firstFoundIndex = i;
countMatched = 0;
if(value[i] == s[countMatched]){//found first letter
do{
++countMatched;
//if all chars in "s" are found in a row, the search succeeded; return the index
if(countMatched == len) return i;
++firstFoundIndex;
//the while condition ensures the loop does not step over the bounds
}while(firstFoundIndex < size && value[firstFoundIndex] == s[countMatched]);
}
}
return -1;
}
int String::lastIndexOf(wchar_t c) const {
return lastIndexOf(c, size - 1);
}
int String::lastIndexOf(wchar_t c, int fromIndex) const {
if(size == 0 || fromIndex < 0) return -1;
if(fromIndex >= size) fromIndex = size - 1;
for(int i = fromIndex; i >= 0; --i){
if(value[i] == c)
return i;
}
return -1;
}
int String::lastIndexOf(const String& s) const {
return lastIndexOf(s.toCString(), size - 1);
}
int String::lastIndexOf(const String& s, int fromIndex) const {
return lastIndexOf(s.toCString(), fromIndex);
}
int String::lastIndexOf(const wchar_t* s) const {
return lastIndexOf(s, size - 1);
}
int String::lastIndexOf(const wchar_t* s, int fromIndex) const {
if(!s || size == 0) return -1;
if(fromIndex < 0) return -1;
else if(fromIndex >= size) fromIndex = size - 1;
int len = wcslen(s);
if(len == 0) return -1;
int countMatched = 0;
int firstFoundIndex = 0;
for(int i = fromIndex; i >= 0; --i){
firstFoundIndex = i;
countMatched = 0;
if(value[i] == s[countMatched]){//found first letter
do{
++countMatched;
//if all chars in "s" are found in a row, the search succeeded; return the index
if(countMatched == len) return i;
++firstFoundIndex;
//the while condition ensures the loop does not step over the bounds
}while(firstFoundIndex < size && value[firstFoundIndex] == s[countMatched]);
}
}
return -1;
}
String String::toLowerCase() const {
if(size == 0) return String();
wchar_t* temp = new wchar_t[size + 1];
for(int i = 0; i < size; ++i){
if(value[i] >= 65 && value[i] <= 90)
temp[i] = value[i] | 0x20; //convert to lower case
else
temp[i] = value[i];
}
temp[size] = '\0';
String s(temp);
delete [] temp;
return s;
}
String String::toUpperCase() const {
if(size == 0) return String();
wchar_t* temp = new wchar_t[size + 1];
for(int i = 0; i < size; ++i){
if(value[i] >= 97 && value[i] <= 122)
temp[i] = value[i] & 0x5F; //convert to upper case
else
temp[i] = value[i];
}
temp[size] = '\0';
String s(temp);
delete [] temp;
return s;
}
bool String::isEmpty() const {
if(size > 0) return false;
return true;
}
String String::trimLeft() {
if(size == 0) return *this;
int beginIndex = 0;
if(value[beginIndex] == ' '){ //if the String starts with a space
while(++beginIndex < size)
if(value[beginIndex] != ' ')
return substring(beginIndex, size);
}
return *this;
}
String String::trimRight() {
if(size == 0) return *this;
int endIndex = size - 1;
//if the String ends with a space, which precedes the terminating '\0'
if(value[endIndex] == ' '){
while(--endIndex >= 0)
if(value[endIndex] != ' ')
//++endIndex is a must, since the length of the substring is endIndex-beginIndex
return substring(0, ++endIndex);
}
return *this;
}
String String::trim(){
if(size == 0) return *this;
int beginIndex = 0;
int endIndex = size - 1;
if(value[beginIndex] == ' '){ //if the String starts with a space
while(++beginIndex < size)
if(value[beginIndex] != ' ')
break;
}
//if the String ends with a space, which precedes the terminating '\0'
if(value[endIndex] == ' '){
while(--endIndex >= 0)
if(value[endIndex] != ' '){
++endIndex;
break;
}
}
if(beginIndex != 0 || endIndex != size - 1)
return substring(beginIndex, endIndex);
return *this;
}
bool String::startsWith(const String* s) const {
if(!s) return false;
if(this == s || (size == 0 && s->length() == 0)) return true;
if(size < s->length()) return false;
for(int i = 0; i < s->length(); ++i){
if(value[i] != s->value[i])
return false;
}
return true;
}
bool String::startsWith(const wchar_t* s) const {
if(!s) return false;
int len = wcslen(s);
if(size == 0 && len == 0) return true;
if(size < len) return false;
for(int i = 0; i < len; ++i){
if(value[i] != s[i])
return false;
}
return true;
}
bool String::endsWith(const String* s) const {
if(!s) return false;
if(this == s || (size == 0 && s->length() == 0)) return true;
if(size < s->length()) return false;
int oriStrIndex = size;
for(int i = s->length() - 1; i >= 0; --i){
if(value[--oriStrIndex] != s->value[i])
return false;
}
return true;
}
bool String::endsWith(const wchar_t* s) const {
if(!s) return false;
int len = wcslen(s);
if(size == 0 && len == 0) return true;
if(size < len) return false;
int oriStrIndex = size;
for(int i = len - 1; i >= 0; --i){
if(value[--oriStrIndex] != s[i])
return false;
}
return true;
}
bool String::equals(const wchar_t* s) const {
if(!s) return false;
int len = wcslen(s);
if(size == 0 && len == 0) return true;
if(size != len) return false;
for(int i = 0; i < size; ++i){
if(value[i] != s[i]) return false;
}
return true;
}
bool String::equals(const String* s) const {
if(!s) return false;
if(this == s || (size == 0 && s->length() == 0)) return true;
if(size != s->length()) return false;
for(int i = 0; i < size; ++i){
if(value[i] != s->value[i]) return false;
}
return true;
}
/*
static String valueOf(long l);
static String valueOf(float f);
static String valueOf(double d);
static String valueOf(wchar_t c);
*/
String operator +(const String& str1, const String& str2){
if(str1.length() == 0 && str2.length() == 0) return String();
wchar_t* temp = new wchar_t[str1.length() + str2.length() + 1];
temp[0] = '\0';
if(str1.length() > 0)
wcscpy(temp, str1.toCString());
if(str2.length() > 0)
wcscat(temp, str2.toCString());
String s(temp);
delete [] temp;
return s;
}
String String::operator +(const wchar_t* str) const{
if(!str) return String(value);
int len = wcslen(str);
if(len == 0) return String(value);
wchar_t* temp = new wchar_t[size + len + 1];
temp[0] = '\0';
wcscpy(temp, value);
wcscat(temp, str);
String s(temp);
delete [] temp;
return s;
}
String operator+(const wchar_t* str1, const String& str2) {
int size = 0;
if(str1) size = wcslen(str1);
wchar_t* temp = new wchar_t[size + str2.length() + 1];
temp[0] = '\0';
if(size > 0)
wcscpy(temp, str1);
if(str2.length() > 0)
wcscat(temp, str2.toCString());
String s(temp);
delete [] temp;
return s;
}
String& String::operator +=(const String& str) {
if(str.length() == 0) return *this;
size += str.length();
wchar_t* temp = new wchar_t[size + 1];
temp[0] = '\0';
wcscpy(temp, value);
wcscat(temp, str.toCString());
delete [] value;
value = temp;
return *this;
}
String& String::operator +=(const wchar_t* str) {
if(!str) return *this;
int len = wcslen(str);
if(len == 0) return *this;
size += len;
wchar_t* temp = new wchar_t[size + 1];
temp[0] = '\0';
wcscpy(temp, value);
wcscat(temp, str);
delete [] value;
value = temp;
return *this;
}
String String::operator =(String& str) {
return str;
}
String String::operator =(const wchar_t* str) {
if(!str) return String();
return String(str);
}
bool String::operator ==(const String* s) const{
return this == s;
}
//ensures that the returned C-style string cannot be changed
const wchar_t* String::toCString() const {
return value;
}
> I am new to the C++ language, I implement this String class just for
> practising.
> I want to know what to improve to make it *qualified* to be used in real
> programs,
Depends on what you define as "real programs" and what your demands
are on a string framework. But this...
String(L"åäö").toUpperCase(); // <-!?
...is completely useless from my perspective. How about yours?
Ok.
> class String{
> private:
> wchar_t* value;
> int size;
The typical approach would be to use two integers (of some type like
std::size_t). The 2nd integer -- usually called "capacity" -- stores
the size of the array whereas "size" (the logical length of the
string) may be smaller. I didn't check your implementation for
operator+= but you usually don't want to allocate a new character
array every time you add a new character to the string.
You also may want to throw out some member functions and replace them
with free functions if appropriate. std::string is an example of a
"bloated super class".
> public:
> String(String& s);
Your copy constructor doesn't take a reference-to-const?
> operator const wchar_t* () const {
> return value;
> }
This is dangerous since this allows *implicit* conversion to a pointer
that points to memory that is *managed* by the string object. If the
string object dies and deletes the char array you basically have an
invalid pointer. Example:
String foo();
void bar() {
wchar_t * invalid = foo();
// accessing "invalid" invokes undefined behaviour
}
This is why std::string has a c_str() member function for this as
opposed to an implicit conversion operator.
Also: Consider making some constructors explicit. Check your textbook
on explicit constructors and conversion.
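A tiny illustration of what "explicit" buys you (a sketch, made-up names):
// With "explicit", accidental conversions no longer compile, and you
// convert by spelling it out.
class Str {
    const wchar_t* text;
public:
    explicit Str(const wchar_t* s) : text(s) {}
    explicit Str(int) : text(L"<number>") {}
};

void print(const Str&) {}

int main() {
    print(Str(L"hello"));   // fine: the conversion is explicit
    // print(L"hello");     // error without a non-explicit constructor
    // print(42);           // error: this used to compile silently
}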
> ~String(){
> size = 0;
> delete [] value;
> }
"size = 0;" is pretty much useless here.
> //these two functions cannot be properly implemented using
> // plain pointers, an Array class is required?
> String** split(const wchar_t& c) const;
> String** split(const String& s) const;
It's not clear from the signature and the functions' names what exactly
the functions do and return. Do you intend to create String
objects dynamically as well as an array of String pointers? Please
don't. Instead, write a *free* function (not a member function) that
does the splitting. This free function could return a vector<String>
object or it could be a function template that takes an "output
iterator" like this:
template<typename OutIter>
void split(String const& str, wchar_t at, OutIter oiter) {
...
*oiter = some_substring;
++oiter;
...
}
which allows you to write
void foo(String const& x) {
vector<String> blah;
split(x,'.',back_inserter(blah));
}
See documentation for the standard header file <iterator>.
( http://www.cplusplus.com/reference/std/iterator/ )
> String trim();
> String trimLeft();
> String trimRight();
...are just three examples that could have been free functions
instead. Also, since you seem to be creating a new String object for
the result as return value you seem to have forgotten 'const'
qualifiers.
> String operator=(String& str);
Your copy assignment doesn't take a reference-to-const String?
> #include "String.h"
>
> String::String(){
> size = 0;
> value = new wchar_t[1]; //one wchar_t to store the terminating null
> value[0] = '\0';
> }
You might want to delay the allocation until you need to return a
wchar_t pointer to a zero terminated string. This should make
creating empty strings faster.
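For instance (a rough sketch; the rest of the class would then have to treat a null 'value' as the empty string):
// Sketch: an empty String allocates nothing; a real buffer only exists
// once the string is non-empty or a C string is requested.
String::String() : value(0), size(0) {}

const wchar_t* String::toCString() const {
    return value ? value : L"";   // fall back to a static empty literal
}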
[snipped a whole lot of code I'm too lazy to go through]
Cheers!
SG
Let me include it here.
On Sat, Apr 18, 2009 at 5:18 PM, kevintse.on...@gmail.com wrote:
> On Apr 18, 8:44 pm, SG <s.gesem...@gmail.com> wrote:
>> kevintse.on...@gmail.com wrote:
>> > [...]
>> [...]
> Actually, I intended to write an *immutable* String, which only had a
> logical length; its length was not supposed to be changeable. This is
> also how Java implements its String class. Huh, I took the idea
> from Java.
OK.
>> > operator const wchar_t* () const {
>> > return value;
>> > }
>>
>> This is dangerous since this allows *implicit* conversion to a pointer
>> that points to memory that is *managed* by the string object. If the
>> string object dies and deletes the char array you basically have an
>> invalid pointer.
>
> So if I want to return a C style string, I have to allocate new memory
> for the string, and let the caller of the function deal with the
> deallocation of that memory? But most of the time I call this
> function just to obtain a C style string that can be output with the
> "iostream" related functions.
I haven't said anything against returning your internal wchar_t*
converted to pointer-to-const. I reasoned against an implicit
conversion operator.
>> > ~String(){
>> > size = 0;
>> > delete [] value;
>> > }
>>
>> "size = 0;" is pretty much useless here.
>
> Set size to zero is necessary, because I return "size" directly when
> the "length()" function is called, this function tells the actual size
> of the String. And this is definitely more efficient than calling
> "wcslen(value)" everytime we need a "size".
You're not entirely familiar with the concept of destructors, are you?
>> > String trim();
>> > String trimLeft();
>> > String trimRight();
>>
>> ...are just three examples that could have been free functions
>> instead. Also, since you seem to be creating a new String object for
>> the result as return value you seem to have forgotten 'const'
>> qualifiers.
>
> I was referring to the Java implementation of the class in which these
> functions return new Strings, so, you know, I copied... oh, Java does
> not have a pointer, or a reference...
I know that. But that's beside the point.
> OK, these functions are all supposed to return references for better
> efficiency, cause I am using C++.
...and what kind of reference would that be? A reference to the
object itself? Do you want these functions to mutate the String
object or just return a modified copy?
Never ever return a reference to a non-static function-local object.
Never ever.
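For the record, the pattern being warned against looks like this (a made-up example):
// Returning a reference to a non-static function-local object: the local
// is destroyed when the function returns, so the caller is left with a
// dangling reference, and using it is undefined behaviour.
const String& brokenTrim() {
    String result(L"  hello  ");   // non-static, function-local
    // ... trim result here ...
    return result;                 // result dies right after this line
}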
>> > String operator=(String& str);
>>
>> Your copy assignment doesn't take a reference-to-const String?
>
> Oh, I have forgotten that...
Cheers!
SG
You can do that, of course. But when you write
String foo = "hello";
String bar = foo;
in JAVA you're only copying a reference and not the object. So, in
order to emulate this in C++ you would use a reference-counted
character array that can be shared among multiple instances of your
String class.
Otherwise every String object would manage its own character array and
there would be no point in preventing any mutations.
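Very roughly, the sharing looks like this (a sketch with made-up names; assignment and the remaining constructors are left out):
#include <wchar.h>

// All copies point at one reference-counted buffer; since nobody mutates
// the characters, the sharing is safe.
struct SharedBuffer {
    wchar_t* chars;
    int refs;
};

class SharedString {
    SharedBuffer* buf;
public:
    explicit SharedString(const wchar_t* s) {
        buf = new SharedBuffer;
        buf->chars = new wchar_t[wcslen(s) + 1];
        wcscpy(buf->chars, s);
        buf->refs = 1;
    }
    SharedString(const SharedString& other) : buf(other.buf) { ++buf->refs; }
    ~SharedString() {
        if (--buf->refs == 0) { delete [] buf->chars; delete buf; }
    }
private:
    SharedString& operator=(const SharedString&); // would release the old
                                                  // buffer and adopt the new
};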
Cheers!
SG
std::string's c_str() does return a pointer to its internal char array, too. I
still don't see the difference between using a function and an implicit
conversion to return the internal wchar_t*; both methods are
*dangerous*, aren't they?
Actually, I have another function "toCString()" which does the same
thing as the implicit conversion, but it was weird enough that when I
tried to test this String class with "std::wcout", "std::wcout <<
myString" output numbers (as we know, wchar_t is an unsigned short on
Windows), while "std::wcout << myString.toCString()" output exactly what
I put in the myString object...
> >> > ~String(){
> >> > size = 0;
> >> > delete [] value;
> >> > }
>
> >> "size = 0;" is pretty much useless here.
>
> > Set size to zero is necessary, because I return "size" directly when
> > the "length()" function is called, this function tells the actual size
> > of the String. And this is definitely more efficient than calling
> > "wcslen(value)" everytime we need a "size".
>
> You're not entirely familiar with the concept of destructors, are you?
The String itself is already out of scope when the destructor is
called automatically, and there's no chance that the "length()"
function will be called, right?
> >> > String trim();
> >> > String trimLeft();
> >> > String trimRight();
>
> >> ...are just three examples that could have been free functions
> >> instead. Also, since you seem to be creating a new String object for
> >> the result as return value you seem to have forgotten 'const'
> >> qualifiers.
>
> > I was referring to the Java implementation of the class in which these
> > functions return new Strings, so, you know, I copied... oh, Java does
> > not have a pointer, or a reference...
>
> I know that. But that's beside the point.
>
> > OK, these functions are all supposed to return references for better
> > efficiency, cause I am using C++.
>
> ...and what kind of reference would that be? A reference to the
> object itself? Do you want these functions to mutate the String
> object or just return a modified copy?
>
> Never ever return a reference to a non-static function-local object.
> Never ever.
what is "non-static function-local object"?
> Cheers!
> SG
Regards,
Kevin
> I am new to the C++ language, I implement this String class just for
> practising.
> I want to know what to improve to make it *qualified* to be used in
> real programs, or what code should be changed to make it more
> efficient...yea, I know you will recommend me to using std::string,
> (-:just practising...
No problem. Where I work, we don't use the STL (for mainly historical
reasons) and it is part of my job to maintain several different string
classes.
Here are some comments about the interface/implementation you chose:
In the functions: String::String(const int&) and String::String(const
long&), you are putting the null terminator at value[size] but in other
functions you are putting the null terminator at value[size + 1].
Also, those two functions are identical. They shouldn't be identical,
but even so, if properly implemented they would be almost identical.
Consider breaking out the parts that are alike into a separate function.
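For instance something like this (a sketch; initFromNumber is a made-up private member template that would also need declaring in the class, and the minInteger special case still needs separate handling):
// Shared helper: both numeric constructors reuse the getLengthOfNumeric/
// getChars helpers the original code already assumes exist in Core.h.
template<typename Num>
void String::initFromNumber(Num num) {
    int lenOfNumeric = num < 0 ? ::getLengthOfNumeric<Num>(-num) + 1
                               : ::getLengthOfNumeric<Num>(num);
    value = new wchar_t[lenOfNumeric + 1];
    ::getChars(num, value, lenOfNumeric);
    value[lenOfNumeric] = L'\0';
    size = lenOfNumeric;
}

String::String(const int& num)  { initFromNumber(num); }
String::String(const long& num) { initFromNumber(num); }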
Your String::String(String& s) copy constructor is missing the 'const',
which limits where/how it can be used.
For String::String(const wchar_t* str, int offset, int count), you
decided not to handle the abnormal case of str == 0, but you handled the
case where offset < 0, and the case where count > wcslen(str) - offset.
Why handle some but not all?
For const wchar_t& String::wchar_tAt(int index) const... Doing something
within a throw that might cause a throw (in this case, allocating
memory) is bad form.
String String::substring(int beginIndex, int endIndex) const... in the
last constructor I mentioned above, you corrected offset and count when
they were bad values, but here you throw if beginIndex and endIndex are
bad values. Consistency is important.
Also, you have two 'new's in this function (one outright, and the
other within the constructor of the 's' object). I suggest you look for
ways to remove one of them. You will probably need to add functionality
to the String class (a new member-function or two,) then you will have
to decide if these new methods should be public or private...
You have an "operator const wchar_t* () const" and a "const wchar_t*
toCString() const" that both do the same thing. Others have already
suggested you remove the former. Generally, having two member-functions
that do the exact same thing is unnecessary duplication (though it
sometimes makes sense in the face of inheritance, but even then one of
the functions should be implemented in terms of the other.)
Your trim functions should all be const.
If you are going to consistently put a null terminator on the end of
your 'value' array, you don't really need the 'size' variable at all;
it's a rather minor optimization that is more than overwhelmed by the
fact that your class has to reallocate the value array every time
someone increases the length of the string.
Now that I have said the above about your current implementation... I'm
not sure that this is an appropriate implementation. There are a lot of
different ways to implement a string class and different implementations
are better for different situations. For example:
Your string class is very inefficient whenever the length() of the
object grows, yet you provide member-functions that grow the string (op+=
for example,) so maybe you should either remove those functions or change
the implementation to make them more efficient.
If you choose to remove the op+=, then you should consider also removing
the op+ as well as the global op+ functions. (I consider it bad form to
provide a op+ but not a op+=.)
That said, your class is largely immutable, the sole mutators are the
two op+= and the two op= functions. If you went all the way and removed
all four of them, you could make your class even more time efficient by
using a data sharing implementation. If you choose to keep these four
functions, then maybe you should consider including more functions for
mutation and go with an implementation that is more efficient in the
face of mutation.
Lastly, you have to consider usage patterns. If 80% of your strings are
less than some limit (say 16 characters,) then you might want to
consider optimizing the class so it doesn't have to allocate any heap
memory in those cases.
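Just to illustrate the last point, such a small-buffer layout might look roughly like this (made-up names; growth and copying are omitted):
// Strings of up to 15 characters live inside the object itself; only
// longer ones pay for a heap allocation.
class SmallString {
    static const int SMALL_CAP = 16;
    wchar_t small[SMALL_CAP];   // in-object buffer for short strings
    wchar_t* heap;              // 0 unless the string outgrew "small"
    int size;
public:
    SmallString() : heap(0), size(0) { small[0] = L'\0'; }
    ~SmallString() { delete [] heap; }
    int length() const { return size; }
    const wchar_t* c_str() const { return heap ? heap : small; }
private:
    SmallString(const SmallString&);            // copying, appending and
    SmallString& operator=(const SmallString&); // growing omitted here
};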
There are several different ways to implement a "string class" and each
method presents different time and space usage measurements. The best
implementation depends on issues that simply cannot be addressed within
the string class itself. The string class provided in the standard has
fair tradeoffs that make it good for general use, but not the best
choice in all cases. If you have a program that makes heavy use of
strings, you might be better off providing your own implementation that
has different tradeoffs or even multiple string classes within the same
program.
Yes, but one more than the other because the implicit conversion can
be triggered unintentionally by the programmer. In your case you rely
on the fact that the original String object lives long enough. So,
it's not really "converted" into some other form of representation.
> > Never ever return a reference to a non-static function-local object.
> > Never ever.
>
> what is "non-static function-local object"?
Don't be lazy. Search the web or look into your favorite C++ textbook.
Cheers!
SG
Assignment/concatenation works well with immutable strings (don't know about the
OP's code, though).
It's much of the point.
Cheers & hth.,
- Alf
--
Due to hosting requirements I need visits to <url: http://alfps.izfree.com/>.
No ads, and there is some C++ stuff! :-) Just going there is good. Linking
to it is even better! Thanks in advance!
IMHO, "immutable strings" can be misunderstood. I think by "strings"
you refer to the internal char array representation as opposed to a
user-defined string class object that manages this char array. I
would still want to mutate those string objects in the same way I can
change the value of some int variable. :)
Cheers!
SG
Tell us more. Seems to me that assignment and concatenation aren't
approprate in an immutable type:
String a("hello");
String b = a;
assert(b.length() == a.length());
a += " world";
assert(b.length() == a.length()); // how could this not fire?
// and if it does fire, how is 'a' immutable?
OK, but, how many of you are there? <g>
> Seems to me that assignment and concatenation aren't
> approprate in an immutable type:
>
> String a("hello");
> String b = a;
> assert(b.length() == a.length());
OK.
> a += " world";
> assert(b.length() == a.length()); // how could this not fire?
The assertion here does not hold.
> and if it does [not hold], how is 'a' immutable?
'a' refers to an immutable string value, a string value that can't be modified.
You can change which string value 'a' refers to.
You can not change the string value itself (although 'a' can, when it can
guarantee that it is the one and only reference, which it easily can).
That is the only practical usage of the word "immutable" for strings, and it is
(not just in fact but also necessarily) the meaning of "immutable" when we refer
to strings in other languages, or certain C++ string classes, as immutable.
Immutable: not subject or susceptible to change or variation in form or
quality or nature.
> I think by "strings" you refer to the internal char array
> representation as opposed to a user-defined string class object that
> manages this char array. I would still want to mutate those string
> objects in the same way I can change the value of some int variable.
> :)
In that case, what does it mean for the string to be immutable? After
all, you can think of an int as an array of bits, each of which can be
accessed individually and modified, should an 'immutable' string then
allow users to access and modify individual bytes (or bits for that
matter)?
Or are you saying that immutable strings are a bad idea? Maybe just
their length should be immutable, while their contents are mutable (like
with an int)? I'm not sure what that would buy us.
Lots of people read your sage advice, even if they agree with it. :-)
> > Seems to me that assignment and concatenation aren't appropriate in
> > an immutable type:
> >
> > String a("hello");
> > String b = a;
> > assert(b.length() == a.length());
> > a += " world";
> > assert(b.length() == a.length()); // how could this not fire?
>
> The assertion here does not hold.
>
> > and if it does [not hold], how is 'a' immutable?
>
> 'a' refers to an immutable string value, a string value that can't
> be modified.
>
> You can change which string value 'a' refers to.
This treats 'string' as a 'const char*' replacement doesn't it? But a
'const char*' is mutable (although that which it points to is not,) and
as such, calling 'string' immutable would be a misnomer.
> That is the only practical usage of the word "immutable" for
> strings, and it is (not just in fact but also necessarily) the
> meaning of "immutable" when we refer to strings in other languages,
> or certain C++ string classes, as immutable.
I can see your point when it comes to languages that treat variables as
pointers to objects (without the pointer arithmetic of course.) The
object may not be mutable, but the variable can be assigned to hold
other objects. But even in those cases, op+= isn't really appropriate is
it? And '.append()' would return a different object and thus require
re-assignment.
I can see your point. We could consider "that which a string contains"
as immutable and then any non-const member-function would reseat the
string rather than changing its contents, but if that was the case, why
stop with op= and op+=? Why not go ahead and implement a whole host of
other non-const functions?
He he. :)
Efficiency and the principle of least surprise.
op+= can be really, really efficient whether the string is immutable or not,
because it can provide time amortized linear in the total result length simply
by increasing the buffer by some factor (usually 2) on reallocation.
And for std::string it is efficient that way. Unfortunately, in common Java
implementations it reportedly isn't, even though it could very easily be. It
wouldn't really help with better Java implementations either, since what matters
is the *guaranteed* behavior, amortized linear, versus possibly quadratic.
However, for an immutable string an operation such as setChar( index, value )
would have O(n) behavior in the length of the string, instead of constant time.
It could have amortized constant time for the special case of repeatedly
modifying the same string a number of times proportional to the string length,
but it could then be surprising to most that in usual cases of doing single
char-changing the setChar operation would have linear time.
So assignment and concatenation (and not to forget, substring, at constant
time!) are easy to do very efficiently with essentially no surprise factor, no
gremlins jumping up and biting the careless programmer, while an operation such
as setChar could only be efficient for special cases.
> In the functions: String::String(const int&) and String::String(const
> long&), you are putting the null terminator at value[size] but in other
> functions you are putting the null terminator at value[size + 1].
No, I allocate "size + 1" wchar_t's in all cases, and put the null
terminator at the "size" position.
> Also, those two functions are identical. They shouldn't be identical,
> but even so, if properly implemented they would be almost identical.
> Consider breaking out the parts that are alike into a separate function.
Yeah, they shouldn't be identical. But int and long seem to have the same
maximum and minimum values on most platforms, so you can see that
almost all the code is identical apart from the "::getLengthOfNumeric"
function invocation.
> For const wchar_t& String::wchar_tAt(int index) const... Doing something
> within a throw that might cause a throw (in this case, allocating
> memory) is bad form.
The function name was charAt(); I did a global replacement in the
whole file and did not notice this until now.
The new String(L"out of bounds.") instantiation is hard coded, so I can
ensure that no exceptions will be thrown as long as memory allocation
for this string succeeds. And in the case of insufficient memory, I
can't do anything, can I?
> String String::substring(int beginIndex, int endIndex) const... in the
> last constructor I mentioned above, you corrected offset and count when
> they were bad values, but here you throw if beginIndex and endIndex are
> bad values. Consistency is important.
You are right. For consistency, I may have to throw exceptions in all
cases.
> You have an "operator const wchar_t* () const" and a "const wchar_t*
> toCString() const" that both do the same thing. Others have already
> suggested you remove the former. Generally, having two member-functions
> that do the exact same thing is unnecessary duplication (though it
> sometimes makes sense in the face of inheritance, but even then one of
> the functions should be implemented in terms of the other.)
There was only a toCString() function in the first place; I added the
implicit conversion just to taste the syntactic sugar. I only learned
about implicit conversion two days ago, and I didn't know it would
cause problems in some cases.
> If you are going to consistently put a null terminator on the end of
> your 'value' array, you don't really need the 'size' variable at all;
> it's a rather minor optimization that is more than overwhelmed by the
> fact that your class has to reallocate the value array every time
> someone increases the length of the string.
I maintain this size variable just for that minor optimization and for
convenience: in some cases I will have to query the length
of the string many times, and I can simply call the length() function
(which returns size directly) as many times as needed, without having to
declare a new variable to hold the length of the string.
I provide two operator overloads that will reallocate
memory for the internal wchar_t*; this is just for convenience too.
This class is supposed to be used as an immutable one in most cases;
if I have to do lots of concatenation, I will use another
implementation of this class, like what Java calls a StringBuffer,
which is a typical mutable string class.
But this is convenient when you need to do case conversions of English
words.
> String::String(const int& num) {
> if(num == minInteger)
> String(L"-2147483648");
That line makes a temporary object that is immediately destroyed
again. The String under construction does not get initialized.
Seemingly you haven't exercised this execution path. The only way to
really put code through its paces is programmatic testing; see "unit
test".
Also, I haven't seen a definition of minInteger in your post --
perhaps it's in Core.h, but arguably this is more obscure than
overtly mentioning std::numeric_limits here.
> value = new wchar_t[1];
> value[0] = '\0';
You keep writing that in any number of places. The definition of what
initializing to empty means should be expressed exactly once,
probably in the same private function you will need in order to correct
the above-mentioned mistake.
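Something along these lines, say (initEmpty is a made-up name for a private helper; the minInteger branch would then copy the literal into "value" directly instead of constructing a throw-away temporary):
// Express "initialize to empty" exactly once.
void String::initEmpty() {
    size = 0;
    value = new wchar_t[1];   // one wchar_t for the terminating null
    value[0] = L'\0';
}

String::String() { initEmpty(); }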
> throw StringIndexOutOfBoundsException(
> new String(L"out of bounds."));
Even if the "new" doesn't fail as someone cautioned against, who will
delete the error message? Probably the exception class; better though
to make the question moot. Just store a pointer to the literal like
std exceptions do, or at least use a smart pointer.
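Roughly like this (a sketch, not your actual exception class):
// The exception stores a pointer to a string literal, so constructing it
// allocates nothing and there is nothing to delete afterwards.
class StringIndexOutOfBoundsException {
    const wchar_t* message;   // points at a literal with static storage duration
public:
    explicit StringIndexOutOfBoundsException(const wchar_t* msg) : message(msg) {}
    const wchar_t* what() const { return message; }
};

// usage: throw StringIndexOutOfBoundsException(L"index out of bounds");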
Martin
--
Quidquid latine scriptum est, altum videtur.
No. About the only use it could have is as an example of how
not to do the job. It fails on implementations not using ASCII,
for example, and it fails for English words like "naïve".
--
James Kanze (GABI Software) email:james...@gmail.com
Conseils en informatique orientée objet/
Beratung in objektorientierter Datenverarbeitung
9 place Sémard, 78210 St.-Cyr-l'École, France, +33 (0)1 30 23 00 34
C++ supports value semantics, and a string class really should
behave as a value. This means no mutator functions, but does
allow using assignment to change the value. Because C++ also
supports things like += on built-in value types, one can also
argue that a string class should support it. On the other hand,
any possibility of modifying a string without using an = sign is
poor design (because of the lack of identity).
> > > Assignment/concatenation works well with immutable strings
> > IMHO, "immutable strings" can be misunderstood.
> Immutable: not subject or susceptible to change or variation
> in form or quality or nature.
> > I think by "strings" you refer to the internal char array
> > representation as opposed to a user-defined string class
> > object that manages this char array. I would still want to
> > mutate those string objects in the same way I can change the
> > value of some int variable. :)
> In that case, what does it mean for the string to be
> immutable? After all, you can think of an int as an array of
> bits, each of which can be accessed individually and modified,
> should an 'immutable' string then allow users to access and
> modify individual bytes (or bits for that matter)?
You can assign to an int, to change its value. For that matter,
you can write things like i |= 0x40. (I'm not sure what the
equivalent should be with a string.)
[...]
> You can change which string value 'a' refers to.
> You can not change the string value itself (although 'a' can,
> when it can guarantee that it is the one and only reference,
> which it easily can).
> That is the only practical usage of the word "immutable" for
> strings, and it is (not just in fact but also necessarily) the
> meaning of "immutable" when we refer to strings in other
> languages, or certain C++ string classes, as immutable.
I think I agree with you, but just to be sure we're on the same
wavelength: what you're basically saying is that (supposing all
variables have string type):
a = b ; // fine
a += b ; // also OK
a.replace( 5, 3, b ) ; // no, except that
a = a.replace( 5, 3, b ) ; // it's OK for a member
// function to return a new
// value, derived from the old.
Just curious, but what do you think about:
a.substr( 5, 3 ) = b ; // the equivalent of the last
// example, above.
(My pre-standard string class supported this; in fact, it
supported:
a( 5, 3 ) = b ; // same as a = a.replace( 5, 3, b )
But that was designed a very long time ago. I've not given much
thought to the question lately, as to whether I'd do the same
thing today.)
It just seems we attach a slightly different meaning to the expression
"value semantics" and "mutation".
> This means no mutator functions, but does
This is where we differ. I'm not talking about values but objects
that have a state that represents a value. In that respect operator=,
operator+= are mutators because the state of the object is mutated to
reflect a new value. This applies to built-in types, too, of course.
Mutating and value semantics are not mutually exclusive.
I thought this discussion is about a possible implementation of string
class (i.e. sharing reference-counted character arrays that are not
altered -- hence "immutable").
If you say "immutable string" but allow assignment and probably +=
then you have a certain idea of what a "string" is and what the thing
is you use for storing a string value. It doesn't seem to match the
"object can be in a state that represents a string value"-idea.
Do you see where I'm going with this?
> [...] On the other hand,
> any possibility of modifying a string without using an = sign is
> poor design (because of the lack of identity).
I count "poor design" as opinion. "lack of identidy" needs some more
explaining, especially with respect to how any string mutator would be
any different than an assignment or += for ints, for example. =, +=,
*= ... just have a special syntax. To avoid misunderstandings: By
"string" in "string mutator" I meant the string *object* and not a
string *value*.
I think this is really just a terminology/point-of-view issue w.r.t.
mutation and whether by "string" you mean "string object" or "string
value". So, if I assign a new "string value" to a "string object" I
mutate this object so that it represents the new string value.
Obviously the expression "immutable string" can be confusing. That's
why I raised some concern.
Cheers!
SG
Yes.
> Just curious, but what do you think about:
> a.substr( 5, 3 ) = b ; // the equivalent of the last
> // example, above.
It can improve efficiency to have it, when it's done repeatedly.
However, I think it can be much more clear to /explicitly/ obtain a mutable
string or string buffer for that.
Then there's no surprise about the O(n) (for n length of a) lurking in there;
it's then clear that it may occur in explicitly obtaining the mutable string /
buffer.
Cheers,
kevints...@gmail.com wrote:
> String String::operator =(String& str) {
> return str;
> }
You are breaking a contract here. operator=() should make this object
equal to the given object, but you are, in fact, doing absolutely
nothing. Thus your 'String' class is horribly broken and cannot be used
eg. in STL containers.
Also operator=() should always return a reference to *this.
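A conventional copy assignment would look roughly like this (a sketch; it also takes a reference-to-const, as suggested elsewhere in the thread, so the declaration would change accordingly):
// Copy the contents and return a reference to *this. Allocating before
// touching the members keeps *this intact if new throws.
String& String::operator=(const String& str) {
    if (this != &str) {
        wchar_t* temp = new wchar_t[str.size + 1];
        wcscpy(temp, str.value);
        delete [] value;
        value = temp;
        size = str.size;
    }
    return *this;
}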
> bool String::operator ==(const String* s) const{
> return this == s;
> }
You are also breaking a contract here (or at the very least
convention). You are not comparing the contents of the strings, but the
pointers themselves. Two strings will compare unequal even if they
contain the exact same contents.
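For comparison, a content-based operator== would look something like this (a sketch; note it takes a reference rather than a pointer, which is the conventional signature):
// Equal when the lengths match and every character matches.
bool String::operator==(const String& s) const {
    if (this == &s) return true;
    if (size != s.size) return false;
    for (int i = 0; i < size; ++i)
        if (value[i] != s.value[i]) return false;
    return true;
}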
[...]
> > Just curious, but what do you think about:
> > a.substr( 5, 3 ) = b ; // the equivalent of the last
> > // example, above.
> It can improve efficiency to have it, when it's done repeatedly.
> However, I think it can be much more clear to /explicitly/
> obtain a mutable string or string buffer for that.
> Then there's no surprise about the O(n) (for n length of a)
> lurking in there; it's then clear that it may occur in
> explicitly obtaining the mutable string / buffer.
I wasn't thinking in terms of efficiency. From the very start,
I had a String and a StringBuilder class---you used the latter
to "construct" strings from characters.
When I introduced the above syntax into my String class, the
users found it very clever. I'd just learned about proxies, and
I found it very clever, too. Which pleased me at the time;
today, I tend to be suspicious of code which is too clever. On
the other hand, strings are so omnipresent that you can accept a
bit more irregularity in them, if it provides notational
convenience. (My strings used half open intervals, rather than
start position and length, and interpreted a negative value as
"from the end", so that a.substr( -3 ) returned a proxy
representing the last three characters in the string. Another
slightly too clever idea, but an enormous notational
convenience.)
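For anyone curious, such a proxy can be sketched in a few lines (made-up names, built on std::wstring purely to keep the sketch short):
#include <cstddef>
#include <string>

// a(5, 3) = b replaces three characters of a starting at index 5.
class Slice {
    std::wstring& target;
    std::size_t pos, len;
public:
    Slice(std::wstring& t, std::size_t p, std::size_t l)
        : target(t), pos(p), len(l) {}
    Slice& operator=(const std::wstring& s) {
        target.replace(pos, len, s);   // write back through the proxy
        return *this;
    }
};

class EditableString {
    std::wstring data;
public:
    EditableString(const wchar_t* s) : data(s) {}
    Slice operator()(std::size_t pos, std::size_t len) {
        return Slice(data, pos, len);
    }
    const std::wstring& str() const { return data; }
};

// EditableString a(L"hello world");
// a(0, 5) = std::wstring(L"goodbye");   // a.str() == L"goodbye world"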
Documentation stating what a String represents, what purpose it has.
The word "string" can have many subtly different meanings.
Yours seems to be like std::basic_string<wchar_t>, but with the
notions of character case and blankspace added.
> or what code should be changed to make it more
> efficient...yea, I know you will recommend me to using std::string,
> (-:just practising...
Curiously, you have conversions from a lot of different types, but not
from std::basic_string itself. That's what I like the least, I think
-- that your class isolates itself from the rest of the C++ world.
(From algorithms, locales and iostreams too, I think.)
Personally, I would never attempt to write a general, reusable
container class. It seems fun and easy, but it's better left to
experts IMHO.
> Here's the code:
> #ifndef STRING_H_
> #define STRING_H_
> #include "Core.h"
>
> class String{
> private:
> wchar_t* value;
> int size;
> public:
> String();
> String(const wchar_t* src);
> String(const wchar_t* src, int offset, int count);
> String(String& s);
> String(const int& num);
> String(const long& l);
> String(const float& f);
> String(const double& d);
> String(const wchar_t& c);
explicit! These constructors are a disaster waiting to happen.
Been there, done that ...
/Jorgen
--
// Jorgen Grahn <grahn@ Ph'nglui mglw'nafh Cthulhu
\X/ snipabacken.se> R'lyeh wgah'nagl fhtagn!
7-bit ASCII is your friend. OK, not *your* friend maybe, but mine for sure!
> 7-bit ASCII is your friend. OK, not *your* friend maybe, but
> mine for sure!
7-bit ASCII is dead, as far as I can tell. Certainly none of
the machines I use use it. My (very ancient) Sparcs use ISO
8859-1, my Linux boxes UTF-8, and Windows UTF-16LE.
The reason is simple, of course: 7-bit ASCII (nor ISO 8859-1,
for that matter) doesn't suffice for any known language.
English, for example, normally distinguishes between opening and
closing quotes---an encoding which doesn't make this distinction
isn't usable for general purpose English. And of course,
regardless of the language, as soon as your program has to deal
with things like people's names, you need to deal with an
incredible number of accents.
Of course, I'm talking here about real programs, designed to be
used in production environments. If your goal is just a Sudoku
solver, then 7-bit ASCII is fine.
English also has words with accented vowels, such as
http://www.merriam-webster.com/dictionary/naivete
I know. I cited naïve myself.
In the past, all (or at least most) languages have made
compromises for typewritten text; a typewriter only has so many
keys, and each key can only produce two characters. (I don't
know how CJK languages handled this.) And each key would only
advance the carriage a fixed difference (if it advanced it at
all). So you end up with English without accents, with no
distinction between opening and closing quotes; French without
the oe ligature or accents on the capital letters; both French
and German without correct quotes; etc. Such things aren't
really acceptable today, however, since computers don't have
these restrictions. Roughly speaking, if you aren't using fixed
width fonts, you should be doing the rest of the typesetting
correctly as well, and that means (in English) naïve and déjà vu
with accents, distinct opening and closing quotes, and so on.
The main problem is to choose a character set and encoding. This
introduces problems in programs.
Unicode seems to be the de-facto standard nowadays, but it's still far
from easy to write programs which would handle all possible unicode
characters without problems. Even if you used raw unicode values as your
internal encoding (using 4 bytes wide characters) or the UTF32 encoding,
you are still going to stumble across problems. That's because something
as simple-sounding as "advance 10 characters forward" is not simple with
unicode even if you use wide characters where each unicode value has
been allocated a fixed amount of bytes. That's because some unicode
characters don't actually represent independent characters, but you can
have compound characters, composed of two unicode values (which means
that to advance one character forward would mean skipping *two* values
rather than one).
Problems become more complicated if you want to use a less verbose
encoding to save memory, such as UTF-8 (optimal for most western
languages) or UTF-16 (optimal eg. for Japanese and other languages heavy
in non-ascii characters). Advancing forward in a piece of text becomes a
challenge.
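For instance, merely stepping over one code point in UTF-8 already takes something like this (a sketch that ignores invalid input, and it still says nothing about combining sequences):
// Advance past one UTF-8 encoded code point: step over the lead byte,
// then over any continuation bytes of the form 10xxxxxx (0x80..0xBF).
const char* nextCodePoint(const char* p) {
    ++p;
    while ((*p & 0xC0) == 0x80)
        ++p;
    return p;
}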
<snip>
> In the past, all (or at least most) languages have made
> compromises for typewritten text; a typewriter only has so many
> keys, and each key can only produce two characters. (I don't
> know how CJK languages handled this.)
one solution:
http://en.wikipedia.org/wiki/Japanese_typewriter
<snip>
It's an application-specific thing, not a machine-specific thing.
> My (very ancient) Sparcs use ISO
> 8859-1, my Linux boxes UTF-8, and Windows UTF-16LE.
>
> The reason is simple, of course: 7-bit ASCII (nor ISO 8859-1,
> for that matter) doesn't suffice for any known language.
Um, how about the C++ programming language!
> Of course, I'm talking here about real programs, designed to be
> used in production environments. If your goal is just a Sudoku
> solver, then 7-bit ASCII is fine.
Of course compilers and other software development tools are just toys. The
English alphabet has 26 characters. No more, no less.
[ ... ]
> > My (very ancient) Sparcs use ISO
> > 8859-1, my Linux boxes UTF-8, and Windows UTF-16LE.
> >
> > The reason is simple, of course: 7-bit ASCII (nor ISO 8859-1,
> > for that matter) doesn't suffice for any known language.
>
> Um, how about the C++ programming language!
Sorry, but no. If you look at $2.10, you'll see "universal-character-
name', which allows one to generate names using characters that don't
fall within the ASCII character set (or ISO 8859 for that matter). It's
_possible_ to encode the source code to a C++ program using only the
characters in (for one example) ISO 646, but it's painful at best.
It's a bit hard to say much about ASCII per se -- the standard has been
obsolete for a long time. Even the organization that formed it doesn't
exist any more.
> > Of course, I'm talking here about real programs, designed to be
> > used in production environments. If your goal is just a Sudoku
> > solver, then 7-bit ASCII is fine.
>
> Of course compilers and other software development tools are just toys.
You do have something of a point -- if you restrict your target audience
sufficiently, you can also restrict some of what is supports (such as
different character sets).
> The English alphabet has 26 characters. No more, no less.
Unfortunately statements like this weaken your point. By any reasonable
measure, the English alphabet contains at least 26 characters (upper and
lower case). Of course, even other western European languages like French
and German require characters that aren't present in the English
alphabet, and the last I heard there were also at least a _few_ people
in places like China, Korea, Japan, the Arabian Peninsula, etc. -- and
most of them use languages in which the characters aren't even similar
to those in English.
--
Later,
Jerry.
The universe is a figment of its own imagination.
[ ... ]
> > The English alphabet has 26 characters. No more, no less.
>
> Unfortunately statements like this weaken your point. By any reasonable
> measure, the English alphabet contains at least 26 characters (upper and
> lower case).
Oops -- of course that should have been "52" rather than 26.
> >> 7-bit ASCII is your friend. OK, not *your* friend maybe,
> >> but mine for sure!
> > 7-bit ASCII is dead, as far as I can tell. Certainly none
> > of the machines I use use it.
> It's an application-specific thing, not a machine-specific
> thing.
That's true to a point---an application can even use EBCDIC,
internally, on any of these machines. In practice, however,
anything that leaves the program (files, printer output, screen
output) will be interpreted by other programs, and an
application will only be usable if it conforms to what these
programs expect.
Which isn't necessarily a trivial requirement. When I spoke of
the encodings used on my machines, I was referring very precisely
to those machines, when I'm logged into them, with the
environment I set up. Neither pure ASCII nor EBCDIC are
options, but there are a lot of other possibilities. Screen
output depends on the font being used (which as far as I know
can't be determined directly by a command line program running
in an xterm), printer output depends on what is installed and
configured on the printer (or in some cases, the spooling
system), and file output depends on the program which later
reads the file---which may differ depending on the program, and
what they do with the data. (A lot of programs in the Unix
world will use $LC_CTYPE to determine the encoding---which means
that if you and I read the same file, using the same program, we
may end up with different results.)
> > My (very ancient) Sparcs use ISO
> > 8859-1, my Linux boxes UTF-8, and Windows UTF-16LE.
> > The reason is simple, of course: 7-bit ASCII (nor ISO 8859-1,
> > for that matter) doesn't suffice for any known language.
> Um, how about the C++ programming language!
C++ accepts ISO/IEC 10646 in comments, string and character
literals, and symbol names. It allows the implementation to do
more or less what it wants with the input encoding, as long as
it interprets universal character names correctly. (How a good
implementation should determine the input encoding is still an
open question, IMHO. All of the scanning tools I write use
UTF-8 internally, and I have transcoding filebuf's which convert
any of the ISO 8859-n, UTF-16 (BE or LE) or UTF-32 (BE or LE)
into UTF-8. On the other hand, all of my tools depend on the
client code telling them which encoding to use; I have some code
floating around somewhere which supports "intelligent guessing",
but it's not really integrated into the rest.)
> > Of course, I'm talking here about real programs, designed to
> > be used in production environments. If your goal is just a
> > Sudoku solver, then 7-bit ASCII is fine.
> Of course compilers and other software development tools are
> just toys. The English alphabet has 26 characters. No more, no
> less.
C, C++, Java and Ada all accept the Unicode character set, in
one form or another. (Ada, and maybe Java, limit it to the
first BMP.) I would think that this is pretty much the case for
any modern programming language.
Ooops, me too: I meant "letters" and not "characters".
"An identifier is an arbitrarily long sequence of letters and digits. Each
universal-character-name in an
identifier shall designate a character whose encoding in ISO 10646 falls
into one of the ranges specified in
Annex A of TR 10176:2003. Upper- and lower-case letters are different. All
characters are significant."
> you'll see "universal-character-
> name', which allows one to generate names using characters that don't
> fall within the ASCII character set (or ISO 8859 for that matter).
Fine, but for an environment or project that has determined that ASCII is
adequate, why in the world would they do that? (And moreso, why would anyone
ever do that?).
> It's _possible_ to encode the source code to a C++ program using only
> the characters in (for one example) ISO 646, but it's painful at best.
Explain.
>
> It's a bit hard to say much about ASCII per se -- the standard has
> been obsolete for a long time. Even the organization that formed it
> doesn't exist any more.
Oh? Is that why such care was taken with the Unicode spec to make sure that
it mapped nicely onto ASCII? ASCII will never die. It is fundamental and
foundational and for lots of programs, complete (read: all that is
necessary).
>
>>> Of course, I'm talking here about real programs, designed to be
>>> used in production environments. If your goal is just a Sudoku
>>> solver, then 7-bit ASCII is fine.
>>
>> Of course compilers and other software development tools are just
>> toys.
>
> You do have something of a point -- if you restrict your target
> audience sufficiently, you can also restrict some of what is supports
> (such as different character sets).
There is a large set of programs that fall in that category.
>
>> The English alphabet has 26 characters. No more, no less.
>
> Unfortunately statements like this weaken your point. By any
> reasonable measure, the English alphabet contains at least 26
> characters (upper and lower case).
Fine, upper and lower case then. But no umlauts or accent marks!
> Of course, even other western
> European languages like French and German require characters that
> aren't present in the English alphabet, and the last I heard there
> were also at least a _few_ people in places like China, Korea, Japan,
> the Arabian Peninsula, etc. -- and most of them use languages in
> which the characters aren't even similar to those in English.
That passage seems non-sequitur: the whole gist was "what if one has
established that English is an appropriate simplifying assumption?".
But there is a huge volume of programs that can and do use just ASCII text.
I gave the example of development tools: parsers, etc. Sure, the web isn't
just ASCII, but that is just an application domain. If that is the target,
then I'll use UnicodeString instead of ASCIIString. I certainly don't want
all the overhead and complexity of Unicode in ASCIIString though. It has too
many valid uses to have to be bothered with a mountain of unnecessary stuff
if being subsumed into the "one size fits all" monstrosity.
>
> Which isn't necessarily a trivial requirement.
On that we agree 100%! That's the rationale for keeping ASCIIString
unaberrated.
> When I spoke of
> the encodings used on my machines, I was referring very precisely
> to those machines, when I'm logged into them, with the
> environment I set up. Neither pure ASCII nor EBCDIC are
> options, but there are a lot of other possibilities. Screen
> output depends on the font being used (which as far as I know
> can't be determined directly by a command line program running
> in an xterm), printer output depends on what is installed and
> configured on the printer (or in some cases, the spooling
> system), and file output depends on the program which later
> reads the file---which may differ depending on the program, and
> what they do with the data. (A lot of programs in the Unix
> world will use $LC_CTYPE to determine the encoding---which means
> that if you and I read the same file, using the same program, we
> may end up with different results.)
I don't get what you mean: an ASCII text file is still an ASCII text file no
matter what font the user chooses in Notepad, e.g. Internally, the program
is still working with ASCII strings, assuming English is the language (PURE
English that recognizes only 26 letters, that is). Nor does it matter that
the platform is Wintel where "behind the scenes" the OS is all UTF-16.
>
>>> My (very ancient)
(Aside Trivia: The "failure" of Sun has been attributed in part to the
unwillingness to move to x86 while "the industry" went there. Very ancient
indeed!).
>>> Sparcs use ISO
>>> 8859-1, my Linux boxes UTF-8, and Windows UTF-16LE.
>
>>> The reason is simple, of course: 7-bit ASCII (nor ISO 8859-1,
>>> for that matter) doesn't suffice for any known language.
The application domain you reference is: Operating System. Quite different
from CSV text file parser. Your statement could be misleading even if you
didn't intend it to be. The "any known language.. blah, blah", is a
generalization that fits the real world, but software programs eventually
are just "zeros and ones". The above from you is an odd perspective noting
that in another thread you were trying to shoehorn something with,
logically, magnitude and direction into a signed integral type.
>
>> Um, how about the C++ programming language!
>
> C++ accepts ISO/IEC 10646 in comments, string and character
> literals, and symbol names.
That's a good expansion point. Let's look at the constituents...
Comments and Symbols: If you want to program in French or 7-bit kanji (The
Matrix?), have at it. I guarantee you that I'll never ever use/need 10646
comments or symbols. I'll be nice and call it a simplifying assumption but
it's really a "no brainer".
Literals: Not a problem for me, and can be worked around for others (put in
file or something: make it data because that's what it is. Programming in
French is hard).
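For what it's worth, a minimal sketch of that "make it data" idea (the file
name and the key=value format are made up for illustration): the source stays
pure ASCII and the non-ASCII text lives in a data file loaded at run time.

#include <fstream>
#include <map>
#include <sstream>
#include <string>

// Hypothetical helper: read lines of the form "key=value" from a message
// file (e.g. a UTF-8 "messages.txt") so that no non-ASCII literal needs to
// appear in the source code itself.
std::map<std::string, std::string> loadMessages(const char* path) {
    std::map<std::string, std::string> messages;
    std::ifstream in(path);
    std::string line;
    while (std::getline(in, line)) {
        std::istringstream parts(line);
        std::string key, value;
        if (std::getline(parts, key, '=') && std::getline(parts, value))
            messages[key] = value;   // e.g. "greeting=Bonjour tout le monde"
    }
    return messages;
}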
Major advantage for me in programming: English is my primary language! (Curb
all the jokes please! ;P). Trying to extend programming (as I know it) to
other languages is not my goal. It may be someone else's proverbial "noble"
goal.
[snip... must one indicate snips?]
>
>>> Of course, I'm talking here about real programs, designed to
>>> be used in production environments. If your goal is just a
>>> Sudoku solver, then 7-bit ASCII is fine.
>
>> Of course compilers and other software development tools are
>> just toys. The English alphabet has 26 characters. No more, no
>> less.
>
> C, C++, Java and Ada all accept the Unicode character set, in
> one form or another.
There's that operating system example again, which doesn't apply to most
application development.
> (Ada, and maybe Java, limit it to the
> first BMP.) I would think that this is pretty much the case for
> any modern programming language.
You are interfusing programming languages with the data that they
manipulate. Perhaps also trying to keep the concept of GP Programming
Language alive even though this very discussion shows that that is not best.
[...]
>>
>> It's a bit hard to say much about ASCII per se -- the standard has
>> been obsolete for a long time. Even the organization that formed it
>> doesn't exist any more.
>
>Oh? Is that why such care was taken with the Unicode spec to make sure that
>it mapped nicely onto ASCII?
Or ISO-8859?
[...]
>
>>
>>> The English alphabet has 26 characters. No more, no less.
>>
>> Unfortunately statements like this weaken your point. By any
>> reasonable measure, the English alphabet contains at least 26
>> characters (upper and lower case).
>
>Fine, upper and lower case then. But no umlauts or accent marks!
How naïve. My _English_ dictionary includes déjà vu, gâteau and many
other words with diacritics.
>
>> Of course, even other western
> > European languages like French and German require characters that
>> aren't present in the English alphabet, and the last I heard there
>> were also at least a _few_ people in places like China, Korea, Japan,
>> the Arabian Peninsula, etc. -- and most of them use languages in
>> which the characters aren't even similar to those in English.
>
>That passage seems non-sequitur: the whole gist was "what if one has
>established that English is an appropriate simplifying assumption?".
Then one still needs some diacritics. The ISO-8859 family has them;
ASCII doesn't.
--
Richard Herring
> >>>> 7-bit ASCII is your friend. OK, not *your* friend maybe,
> >>>> but mine for sure!
> >>> 7-bit ASCII is dead, as far as I can tell. Certainly none
> >>> of the machines I use use it.
> >> It's an application-specific thing, not a machine-specific
> >> thing.
> > That's true to a point---an application can even use EBCDIC,
> > internally, on any of these machines. In practice, however,
> > anything that leaves the program (files, printer output,
> > screen output) will be interpreted by other programs, and an
> > application will only be usable if it conforms to what these
> > programs expect.
> But there is a huge volume of programs that can and do use
> just ASCII text.
There is a huge volume of programs that can and do use no text.
However, I don't know of any program today that uses text in
ASCII; text is used to communicate with human beings, and ASCII
isn't sufficient for that.
> I gave the example of development tools: parsers, etc.
Except that the examples are false. C/C++/Java and Ada require
Unicode. Practically everything on the network is UTF-8.
Basically, except for some historical tools, ASCII is dead.
> Sure, the web isn't just ASCII, but that is just an
> application domain. If that is the target, then I'll use
> UnicodeString instead of ASCIIString. I certainly don't want
> all the overhead and complexity of Unicode in ASCIIString
> though. It has too many valid uses to have to be bothered with
> a mountain of unnecessary stuff if being subsumed into the
> "one size fits all" monstrosity.
As long as you're the only person using your code, you can do
what you want.
> > Which isn't necessarily a trivial requirement.
> On that we agree 100%! That's the rationale for keeping
> ASCIIString unaberrated.
I understand the rationale.
> > When I spoke of the encodings used on my machines, I was
> > referring very precisely to those machines, when I'm logged
> > into them, with the environment I set up. Neither pure
> > ASCII nor EBCDIC are options, but there are a lot of other
> > possibilities. Screen output depends on the font being used
> > (which as far as I know can't be determined directly by a
> > command line program running in an xterm), printer output
> > depends on what is installed and configured on the printer
> > (or in some cases, the spooling system), and file output
> > depends on the program which later reads the file---which
> > may differ depending on the program, and what they do with
> > the data. (A lot of programs in the Unix world will use
> > $LC_CTYPE to determine the encoding---which means that if
> > you and I read the same file, using the same program, we may
> > end up with different results.)
> I don't get what you mean: an ASCII text file is still an
> ASCII text file no matter what font the user chooses in
> Notepad, e.g.
First, there is no such thing as an ASCII text file. For that
matter, under Unix, there is no such thing as a text file. A
file is a sequence of bytes. How those bytes are interpreted
depends on the application. Most Unix tools expect text, in an
encoding which depends on the environment ($LC_CTYPE, etc.).
Most Unix tools delegate display to X, passing the bytes on to
the window manager "as is". And all Unix tools delegate to the
spooling system or the printer for printing, again, passing the
bytes on "as is" (more or less---the spooling system often has
some code translation in it). None of these take into
consideration what you meant when you wrote the file.
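To make that concrete, a minimal sketch (assuming a POSIX-style environment)
of how a command-line tool typically discovers which encoding it is expected
to use -- the bytes say nothing by themselves, the environment does:

#include <clocale>
#include <cstdio>
#include <cstdlib>

int main() {
    // Adopt the user's locale instead of the default "C" locale.
    std::setlocale(LC_CTYPE, "");

    // Many Unix programs also look at the environment directly;
    // LC_ALL overrides LC_CTYPE, which overrides LANG.
    const char* enc = std::getenv("LC_ALL");
    if (!enc || !*enc) enc = std::getenv("LC_CTYPE");
    if (!enc || !*enc) enc = std::getenv("LANG");
    std::printf("interpreting text as: %s\n", enc ? enc : "(unspecified)");
    return 0;
}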
> Internally, the program is still working with ASCII strings,
> assuming English is the language (PURE English that recognizes
> only 26 letters, that is).
Pure English has accented characters in some words (at least
according to Merriam Webster, for American English). Pure
English distinguishes between open and closing quotes, both
single and double. Real English distinguishes between a hyphen,
an en dash and an em dash.
But that's all irrelevant, because in the end, you're writing
bytes, and you have to establish some sort of agreement between
what you mean by them, and what the programs reading the data
mean. (*If* we could get by with only the characters in
traditional ASCII, it would be nice, because for historical
reasons, most of the other encodings encountered encode those
characters identically. Realistically, however, any program
dealing with text has to support more, or nobody will use it.)
> Nor does it matter that the platform is Wintel where "behind
> the scenes" the OS is all UTF-16.
> >>> My (very ancient)
> (Aside Trivia: The "failure" of Sun has been attributed in
> part to the unwillingness to move to x86 while "the industry"
> went there. Very ancient indeed!).
Where did you get that bullshit? Sun does sell x86 processors
(using the AMD chip). And IBM and HP are quite successful with
their lines of non-x86 processors. (IMHO, where Sun went wrong
was in abandoning its traditional hardware market, and moving
into software adventures like Java.)
> >>> Sparcs use ISO 8859-1, my Linux boxes UTF-8, and Windows
> >>> UTF-16LE.
> >>> The reason is simple, of course: 7-bit ASCII (nor ISO
> >>> 8859-1, for that matter) doesn't suffice for any known
> >>> language.
> The application domain you reference is: Operating System.
> Quite different from CSV text file parser.
I'm not referencing any application domain in particular.
Practically all of the Unix applications I know take the
encoding from the environment; those that don't use UTF-8 (the
more recent ones, anyway). All of the Windows applications I
know use UTF-16LE.
Do you think anyone would use MS Office or Open Office if they
only supported ASCII?
> Your statement could be misleading even if you didn't intend
> it to be. The "any known language.. blah, blah", is a
> generalization that fits the real world,
Yes. That's where I live and work. In the real world. I
produce programs that other people use. (In practice, my
programs don't usually deal with text, except maybe to pass it
through, so I'm not confronted with the problem that often. But
often enough to be aware of it.)
> but software programs eventually are just "zeros and ones".
Not really. Programs assign semantics to those ones and zeros.
Even at the hardware level---a float and an int may contain the
same number of bits, but the code uses different instructions
with them. Programs interpret the data.
Which brings us back to my point above---you don't generally
control how other programs are going to interpret the data you
write.
> The above from you is an odd perspective noting that in
> another thread you were trying to shoehorn something with,
> logically, magnitude and direction into a signed integral
> type.
Sorry, I don't know what you're talking about.
> >> Um, how about the C++ programming language!
> > C++ accepts ISO/IEC 10646 in comments, string and character
> > literals, and symbol names.
> That's a good expansion point. Let's look at the constituents...
> Comments and Symbols: If you want to program in French or
> 7-bit kanji (The Matrix?), have at it.
I've already had to deal with C with the symbols in Kanji. That
would have been toward the end of the 1980s. And I haven't seen
a program in the last ten years which didn't use symbols and
have comments in either French or German.
> I guarantee you that I'll never ever use/need 10646 comments
> or symbols.
Fine. If you write a compiler, and you're the only person to
use it, you can do whatever you want. But there's no sense in
talking about it here, since it has no relevance in the real
world.
> I'll be nice and call it a simplifying assumption but it's
> really a "no brainer".
> Literals: Not a problem for me, and can be worked around for
> others (put in file or something: make it data because that's
> what it is. Programming in French is hard).
No it's not. (Actually, the most difficult language to program
in is English, because so many useful words are reserved as key
words. When I moved to C++, from C, I got hit several times in
the code written in English, by things like variables named
class. Never had that problem with the French classe, nor the German
Klasse.)
> Major advantage for me in programming: English is my primary
> language!
It's one of my primarly languages as well. Not the only one,
obviously, but one of them.
> (Curb all the jokes please! ;P). Trying to extend programming
> (as I know it) to other languages is not my goal. It may be
> someone else's proverbial "noble" goal.
> [snip... must one indicate snips?]
> >>> Of course, I'm talking here about real programs, designed to
> >>> be used in production environments. If your goal is just a
> >>> Sudoku solver, then 7-bit ASCII is fine.
> >> Of course compilers and other software development tools
> >> are just toys. The English alphabet has 26 characters. No
> >> more, no less.
> > C, C++, Java and Ada all accept the Unicode character set,
> > in one form or another.
> There's that operating system example again, which doesn't apply
> to most application development.
That has nothing to do with the operating system. Read the
language standards.
> > (Ada, and maybe Java, limit it to the first BMP.) I would
> > think that this is pretty much the case for any modern
> > programming language.
> You are interfusing programming languages with the data that
> they manipulate.
No. Do you know any of the languages in question? All of them
clearly require support for at least the first BMP of Unicode in
the compiler. You may not use that possibility---a lot of
people don't---but it's a fundamental part of the language.
(FWIW: I think that C++ was the first to do so.)
[ ... ]
> Fine, but for an environment or project that has determined that ASCII is
> adequate, why in the world would they do that? (And moreso, why would anyone
> ever do that?).
Who has ever determined that ASCII was adequate? ASCII was never
anything more than a stopgap -- a compromise between what was wanted,
and what you could reasonably support at a time when a machine with 32K
of RAM and (if you were really lucky) a 40 megabyte hard-drive needed to
support a few hundred simultaneous users because it cost well over a
million dollars.
ASCII has been obsolete for decades -- let it rest in peace.
> > It's _possible_ to encode the source code to a C++ program using only
> > the characters in (for one example) ISO 646, but it's painful at best.
>
> Explain.
Look up trigraphs and digraphs. They were invented specifically because
ISO 646 doesn't include all the characters normally used in C or C++
source code.
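For illustration, a small (and deliberately ugly) sketch of what C++ source
restricted to ISO 646 looks like when written with digraphs; trigraphs
(??= for #, ??< for {, and so on) served the same purpose and were later
removed from the language:

%:include <cstdio>              // %: is the digraph for #

int main() <%                   // <% and %> are digraphs for { and }
    int a<:3:> = <% 1, 2, 3 %>; // <: and :> are digraphs for [ and ]
    std::printf("%d\n", a<:0:>);
    return 0;
%>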
[ ... ]
> Oh? Is that why such care was taken with the Unicode spec to make sure that
> it mapped nicely onto ASCII? ASCII will never die. It is fundamental and
> foundational and for lots of programs, complete (read: all that is
> necessary).
You're right about one or two points, but not in the way you think. For
example, it's true that ASCII won't die -- but only because it's already
been dead and buried for decades. Unicode and ISO 10646 weren't written
particularly to be compatible with ASCII -- they were written to be
compatible with the common base area of ISO 8859. Claiming that's
"ASCII" does nothing more than display ignorance of both standards.
[ ... ]
> > You do have something of a point -- if you restrict your target
> > audience sufficiently, you can also restrict some of what it supports
> > (such as different character sets).
>
> There is a large set of programs that fall in that category.
I suppose that depends on how you define "large". My immediate guess
would be that it's a single-digit percentage.
[ ... ]
> > Of course, even other western
> > European languages like French and German require characters that
> > aren't present in the English alphabet, and the last I heard there
> > were also at least a _few_ people in places like China, Korea, Japan,
> > the Arabian Peninsula, etc. -- and most of them use languages in
> > which the characters aren't even similar to those in English.
>
> That passage seems non-sequitur: the whole gist was "what if one has
> established that English is an appropriate simplifying assumption?".
Quite the contrary -- the point was that IF you've determined that you
can use only a subset of the English alphabet, that's fine -- but you can
almost never determine any such thing.
> [ ... ]
> > Oh? Is that why such care was taken with the Unicode spec to
> > make sure that it mapped nicely onto ASCII? ASCII will never
> > die. It is fundamental and foundational and for lots of
> > programs, complete (read: all that is necessary).
> You're right about one or two points, but not in the way you
> think. For example, it's true that ASCII won't die -- but only
> because it's already been dead and buried for decades. Unicode
> and ISO 10646 weren't written particularly to be compatible
> with ASCII -- they were written to be compatible with the
> common base area of ISO 8859. Claiming that's "ASCII" does
> nothing more than display ignorance of both standards.
And the common base area of ISO 8859 was compatible with ASCII.
Historically, this was an issue: when ISO 8859 was introduced,
we still wanted to be able to read and interpret existing files,
and even today, a file written using just the printable
characters from ASCII will encode the same in all of the ISO
8859 encodings and in UTF-8. A useful characteristic if you
want to determine the encoding from the contents of the file
(e.g. as in XML)---you limit the characters in the file to just
this small set until you've specified the encoding, and the
parsing code doesn't have to commit to the actual encoding until
after it has read the specification.
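A minimal sketch of that detection trick (a hypothetical helper, not taken
from any standard): look for a BOM first, and otherwise rely on the fact that
the ASCII-subset bytes of the declaration read the same under UTF-8 and the
ISO 8859 encodings.

#include <cstddef>
#include <string>

std::string guessEncoding(const unsigned char* p, std::size_t n) {
    if (n >= 3 && p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) return "UTF-8";    // BOM
    if (n >= 2 && p[0] == 0xFE && p[1] == 0xFF)                 return "UTF-16BE"; // BOM
    if (n >= 2 && p[0] == 0xFF && p[1] == 0xFE)                 return "UTF-16LE"; // BOM
    if (n >= 2 && p[0] == '<'  && p[1] == 0x00)                 return "UTF-16LE"; // "<?" without BOM
    if (n >= 2 && p[0] == 0x00 && p[1] == '<')                  return "UTF-16BE";
    // No BOM: an XML parser now reads "<?xml ... encoding='...'?>" using only
    // the common ASCII subset, then switches to the declared encoding.
    return "UTF-8 or ISO 8859-n (read the declaration to decide)";
}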
> [ ... ]
> > > You do have something of a point -- if you restrict your
> > > target audience sufficiently, you can also restrict some
> > > of what it supports (such as different character sets).
> > There is a large set of programs that fall in that category.
> I suppose that depends on how you define "large". My immediate
> guess would be that it's a single-digit percentage.
Of those programs dealing with text. If you include all
programs, I suspect that most programs (e.g. the one which
controls the ignition in your car) don't use any character data
at all, so strictly speaking, they don't need more than plain
ASCII (since they don't even need that).
Of course, that's totally irrelevant to the argument about which
encoding to use for text data. (For what it's worth, I've seen
more EBCDIC in the last ten years than I've seen ASCII.)
[ ... ]
> And the common base area of ISO 8859 was compatible with ASCII.
That depends on exactly what you mean by "compatible with". It's not
identical to ASCII though. For one example, in ASCII character 96 is a
reverse quote, but in ISO 8859 it's a grave accent.
I suppose you can argue that those are the same thing if you want --
none of the encoding standards makes any requirement about the glyphs
used to display a particular character, so they could perfectly well be
displayed with identical glyphs. Nonetheless, the two do not share the
same intent.
[ ... ]
> Of those programs dealing with text. If you include all
> programs, I suspect that most programs (e.g. the one which
> controls the ignition in your car) don't use any character data
> at all, so strictly speaking, they don't need more than plain
> ASCII (since they don't even need that).
Well, yes - given that the discussion was about text encoding, I treated
the universe as programs that work with encoded text in some way.
I'm sorry but as far as I know that's BS. :-)
Would be nice to know where you picked up that piece of disinformation, though.
Or whether we ("we" = me, Wikipedia, James, etc.) are all wrong...
> I suppose you can argue that those are the same thing if you want --
> none of the encoding standards makes any requirement about the glyphs
> used to display a particular character, so they could perfectly well be
> displayed with identical glyphs. Nonetheless, the two do not share the
> same intent.
On the contrary, AFAIK the intent of ISO 8859-1 was to contain ASCII sans the
control characters directly as a subset.
[ ... ]
> I'm sorry but as far as I know that's BS. :-)
Have you looked at both specifications to find out? Have you even looked
at one of them?
> Would be nice to know where you picked up that piece of disinformation, though.
It would be nice to know exactly what convinces you that it's
disinformation, and particularly whether you have any authoritative
source for the claim. Wikipedia certainly doesn't qualify, and as much
respect as I have to James, I don't think he does either. It would
appear to me that the only authoritative sources on the subject are the
standards themselves -- and your statement leads me to doubt that you've
consulted them in this case.
You're reversing the burden of evidence.
You made an outrageous claim, which if it were true would make ISO 8859-1 a very
impractical standard; now please submit your evidence that you think is in favor
of that claim.
Cheers & hth.,
[ ... ]
> You're reversing the burden of evidence.
>
> You made an outrageous claim, which if it were true would make ISO 8859-1 a very
> impractical standard; now please submit your evidence that you think is in favor
> of that claim.
I thought I'd made it clear, but the evidence is the standards
themselves. If, by "submit" you mean posting them here, I obviously
can't do that -- they're all copyrighted, as I'm sure you're already
well aware.
As for rendering anything impractical, I don't think it does anything of
the sort. Quite the contrary, there's not likely to be any practical
effect at all -- what you get is pretty much the same regardless of what
name the standard chooses to give it.
Ultimately, this isn't particularly different from the '.' character --
we use it both as a period (full stop/end of sentence marker) and a
decimal point. Whether some particular document calls it a "decimal
point" or "period" or "full stop" makes little real difference to how
people actually put it to use. A standard that chose one name over the
other might reflect the cultural background of its designers, but
wouldn't be particularly likely to render that standard any more or less
practical.
I'm sorry on your behalf, but quoting a limited part of a standard is fair use,
so there's nothing stopping you from that.
In passing, note that the error in your reasoning started with the "obviously";
that little code-word often signals an error of reasoning.
And in case you doubt that quoting is fair use, note that in this group we often
quote from the C++ standard -- perhaps you have done so yourself, earlier?
> As for rendering anything impractical, I don't think it does anything of
> the sort. Quite the contrary, there's not likely to be any practical
> effect at all -- what you get is pretty much the same regardless of what
> name the standard chooses to give it.
>
> Ultimately, this isn't particularly different from the '.' character --
> we use it both as a period (full stop/end of sentence marker) and a
> decimal point. Whether some particular document calls it a "decimal
> point" or "period" or "full stop" makes little real difference to how
> people actually put it to use. A standard that chose one name over the
> other might reflect the cultural background of its designers, but
> wouldn't be particularly likely to render that standard any more or less
> practical.
Assuming for the sake of argument that the two standards use different terms to
describe character 96, since it seems you're reluctant to offer any evidence,
almost as if the implication that you have these standards wasn't true.
Is your point that the two standards use different terms for the same thing?
In that case either your argument earlier in the thread was misleading, or your
current argument is misleading.
Or is your point that the two standards use different terms with the intention
to denote two different things?
In that case you have misunderstood the standards.
[ ... ]
> I'm sorry on your behalf, but quoting a limited part of a standard is fair use,
> so there's nothing stopping you from that.
I've already quoted the relevant parts. Each has a table of numbers and
the character associated with each number. In the ASCII table, it's
listed as a backward quote. In the ISO 8859 table, it's listed as a
grave accent.
> In passing, note that the error in your reasoning started with the "obviously";
> that little code-word often signals an error of reasoning.
There was no error in reasoning.
> And in case you doubt that quoting is fair use, note that in this group we often
> quote from the C++ standard -- perhaps you have done so yourself, earlier?
I have no problem with fair use, or quoting relevant portions. In this
case, there's no other explanatory text, so I've already quoted
everything I can find that's relevant.
[ ... ]
> Assuming for the sake of argument that the two standards use different terms to
> describe character 96, since it seems you're reluctant to offer any evidence,
> almost as if the implication that you have these standards wasn't true.
I'm not sure what further evidence would be relevant -- I've already
quoted what each says on the subject. Neither appears to have anything
beyond the single-word description of that particular character.
> Is your point that the two standards use different terms for the same thing?
>
> In that case either your argument earlier in the thread was misleading, or your
> current argument is misleading.
>
> Or is your point that the two standards use different terms with the intention
> to denote two different things?
>
> In that case you have misunderstood the standards.
"I'm sorry, but as far as I know, that's BS."
It seems quite incredible for you to claim certainty about the intent of
the standard, especially one that you've apparently never even seen.
I don't claim clairvoyance, so I can only go by what's in the standards
themselves. The text is different, and not in a way I can reasonably
attribute to a typo or anything like that. This seems to support the
belief that there was a real intent to change the meaning to at least
some degree.
If you're honestly interested in the question of what constitutes a
difference between characters at the level of abstraction used in an
encoding standard, I'd advise googling for Han Unification. Early on,
Unicode used Han Unification to reduce the number of code points
necessary for the Chinese, Japanese and Korean alphabets. Considerable
controversy resulted, all based around the question of where to draw the
line between characters that were the same or different.
You have as yet not quoted anything, at least not to me.
I don't care what you quoted some years ago to someone else in another venue.
The original final ASCII standard from 1967 is no longer available so I'm
surprised you claim to have it.
As an alternative you might take a look at
<url: http://wps.com/projects/codes/Revised-ASCII/page3.JPG>.
> Each has a table of numbers and
> the character associated with each number. In the ASCII table, it's
> listed as a backward quote. In the ISO 8859 table, it's listed as a
> grave accent.
According to the source referenced above, in original ASCII it's an "apostrophe,
or close quotation" when used as punctuation, and an "acute accent" when used as
a diacritical mark. Original ASCII represented diacritical marks by backspacing,
i.e. the visual effect of char + BS + mark on a printer. This convention did not
survive, however, and in later usage as well as in Latin-1 it's punctuation.
You might also wish to consult the Unicode standard's reference document on the
ASCII subset of Unicode.
And in that connection note that Unicode contains Latin-1 (ISO 8859-1) as a
subset, overlapping with ASCII, that is, the same code points...
>> In passing, note that the error in your reasoning started with the "obviously";
>> that little code-word often signals an error of reasoning.
>
> There was no error in reasoning.
There certainly was, and still is.
>> And in case you doubt that quoting is fair use, note that in this group we often
>> quote from the C++ standard -- perhaps you have done so yourself, earlier?
>
> I have no problem with fair use, or quoting relevant portions. In this
> case, there's no other explanatory text, so I've already quoted
> everything I can find that's relevant.
The ASCII standard has/had explanatory text.
It sounds to me like you're referring to just some code chart that someone
labeled "ASCII".
> [ ... ]
>
>> Assuming for the sake of argument that the two standards use different terms to
>> describe character 96, since it seems you're reluctant to offer any evidence,
>> almost as if the implication that you have these standards wasn't true.
>
> I'm not sure what further evidence would be relevant -- I've already
> quoted what each says on the subject. Neither appears to have anything
> beyond the single-word description of that particular character.
It seems you don't have the standards.
They have much more than code charts.
>> Is your point that the two standards use different terms for the same thing?
>>
>> In that case either your argument earlier in the thread was misleading, or your
>> current argument is misleading.
>>
>> Or is your point that the two standards use different terms with the intention
>> to denote two different things?
>>
>> In that case you have misunderstood the standards.
>
> "I'm sorry, but as far as I know, that's BS."
>
> It seems quite incredible for you to claim certainty about the intent of
> the standard, especially one that you've apparently never even seen.
Don't throw out inane accusations on top of obstinate wrong-headedness and a
ridiculous claim.
You have misunderstood whatever material you have, and you haven't understood
that Latin-1 is a direct extension of ASCII (sans control characters), and that
Unicode is a direct extension of Latin-1 -- which is what you need to grasp.
[ ... ]
> You have as yet not quoted anything, at least not to me.
Yes, I did. When I said the ISO standard describes the character as a
grave accent, that was a direct quote from the standard -- it's also
_all_ the standard says about that character.
[ ... ]
> The original final ASCII standard from 1967 is no longer available so I'm
> surprised you claim to have it.
As it happens, we needed a copy at work a few years ago, so we had a
couple of people working for a week or so to find it. As I recall, the
copy we found was at a university in Australia, from which we got a
Xeroxed copy.
BTW, you seem to have rather a problem with the date there as well --
the _original_ final ASCII standard was in 1963. The 1967 version was a
revision. There was also a 1968 revision, and as I understand it, a 1986
version as well (though I've never seen a copy of the latter). The
changes from the 1967 to '68 standards were quite minimal though.
The ASCII standard only gave extremely minimal descriptions of the
control characters as well. The ISO did publish a separate document
(roughly what would now be called a TR) giving (somewhat) more detailed
description of the control characters -- but not of the printable
characters.
> As an alternative you might take a look at
>
> <url: http://wps.com/projects/codes/Revised-ASCII/page3.JPG>.
>
>
> > Each has a table of numbers and
> > the character associated with each number. In the ASCII table, it's
> > listed as a backward quote. In the ISO 8859 table, it's listed as a
> > grave accent.
>
> According to the source referenced above, in original ASCII it's an "apostrophe,
> or close quotation" when used as punctuation, and an "acute accent" when used as
> a diacritical mark.
You're not even looking at the right character. The character under
discussion is a couple of lines down, the 6/0 rather than 2/7. In any
case, this seems to be from somebody else's understanding of ASCII, not
from the standard itself.
[ ... ]
> There certainly was, and still is.
You haven't shown any mistake in reasoning yet. In fact, you haven't
even figured out which character is being discussed yet, and you've
shown nothing to indicate that you've looked at the original source
either.
[ ... ]
> The ASCII standard has/had explanatory text.
Yes, some -- but not for the character in question.
OTOH (working to get back to something topical), it does contain
explanatory text showing that the use of "new line" in C and C++ really
does come directly from ASCII:
In the definition of LF (page 8):
Where appropriate, this character may have the meaning
"New Line" (NL), a format effector which controls the
movement of the printing point to the first printing
position on the next printing line. Use of this
convention requires agreement between the sender and
recipient of data.
and in Appendix A, section A7.6:
The function "New Line" (NL) was associated with the LF
(rather than with CR or with a separate character) to
provide the most useful combination of functions through
the use of only two character positions, and to allow the
use of a common end-of-line format for both printers
having separate CR-LF functions and those having a
combined (i.e., NL) function. This sequence would be
CR-LF, producing the same result on both classes, and
would be useful during conversion of a system from one
method of operation to the other.
I believe this interpretation of LF was new in the 1968 version of the
standard.
If you're interested in the history of ASCII and its standardization,
you might want to look at Bob Bemer's web site and this related history page:
http://www.trailing-edge.com/~bobbemer/index.htm
http://www.cs.tut.fi/~jkorpela/latin1/ascii-hist.html
I certainly can't vouch for everything he says being absolutely
accurate, but everything I've seen in it looks pretty reasonable and he
gives references for nearly everything. Unless essentially _everything_
he says is wrong, he demonstrates the point I was originally trying to
make quite well -- while the newer standards largely _attempt_ to act as
proper supersets of ASCII, there are enough variations between early
character sets (e.g. between US-ASCII and ISO 646) that this isn't
always entirely possible.
Interestingly, he has a rather lengthy piece about the character I
mentioned (opening single quote mark / grave accent). This seems to show
exactly HOW things got the way they are. It was originally proposed in
the ISO committee as a grave accent. The US committee then overloaded it
to be an opening single quote. My guess is that ISO 8859 was written
primarily (if not exclusively) as a superset of ISO 646, so they simply
ignored the American aberration of calling it an opening single quote.
A number of other characters (including the right single
quote/apostrophe to which Alf referred) have slightly differing
definitions between different standards as well. The first (1963) ASCII
standard referred to it purely as an apostrophe. Later versions added
the notations of Closing Single Quotation mark and Acute Accent. The
current versions of ISO 646, 8859 and 10646 have gone back to the
original and refer to it only as an apostrophe.
The available evidence suggests that Alf's accusations were and are
unfounded -- the definitions associated with a number of code points
have varied between standards, but this has neither led to any
significant incompatibility nor rendered any of the standards
particularly impractical. At the same time, even in the simplest of
plain text, using only 7-bit characters, there are variations in the
interpretations of a few code points.
And how many variable names do you create with those foreign glyphs? Hmm?
>
>>
>>> Of course, even other western
>>> European languages like French and German require characters that
>>> aren't present in the English alphabet, and the last I heard there
>>> were also at least a _few_ people in places like China, Korea,
>>> Japan, the Arabian Peninsula, etc. -- and most of them use
>>> languages in which the characters aren't even similar to those in
>>> English.
>>
>> That passage seems non-sequitur: the whole gist was "what if one has
>> established that English is an appropriate simplifying assumption?".
>
> Then one still needs some diacritics. The ISO-8859 family has them;
> ASCII doesn't.
The issue here is not Webster's Dictionary.
You must be thinking of shrink-wrap-type user-interactive programs rather
than in-house development tools, for example.
> text is used to communicate with human beings, and ASCII
> isn't sufficient for that.
Millions of posts on USENET seem to contradict that statement.
>
>> I gave the example of development tools: parsers, etc.
>
> Except that the examples are false. C/C++/Java and Ada require
> Unicode.
To be general they do. One could easily eliminate that requirement and still
get much work done. I'm "arguing" not against Unicode, but that the ASCII
subset, in and of itself, is useful.
> Practically everything on the network is UTF-8.
> Basically, except for some historical tools, ASCII is dead.
Nah, it's alive and well, even if you choose to call it a subset of
something else. Parse all of the non-binary group posts and see how many
non-ASCII characters come up (besides your tagline!).
>
>> Sure, the web isn't just ASCII, but that is just an
>> application domain. If that is the target, then I'll use
>> UnicodeString instead of ASCIIString. I certainly don't want
>> all the overhead and complexity of Unicode in ASCIIString
>> though. It has too many valid uses to have to be bothered with
>> a mountain of unnecessary stuff if being subsumed into the
>> "one size fits all" monstrosity.
>
> As long as you're the only person using your code, you can do
> what you want.
person, or company, or group, or alliance all work. Standards were meant to
be... ignored (there's always a better way)! ;)
>
>>> Which isn't necessarily a trivial requirement.
>
>> On that we agree 100%! That's the rationale for keeping
>> ASCIIString unaberrated.
>
> I understand the rationale.
>
>>> When I spoke of the encodings used on my machines, I was
>>> referring very precisely to those machines, when I'm logged
>>> into them, with the environment I set up. Neither pure
>>> ASCII nor EBCDIC are options, but there are a lot of other
>>> possibilities. Screen output depends on the font being used
>>> (which as far as I know can't be determined directly by a
>>> command line program running in an xterm), printer output
>>> depends on what is installed and configured on the printer
>>> (or in some cases, the spooling system), and file output
>>> depends on the program which later reads the file---which
>>> may differ depending on the program, and what they do with
>>> the data. (A lot of programs in the Unix world will use
>>> $LC_CTYPE to determine the encoding---which means that if
>>> you and I read the same file, using the same program, we may
>>> end up with different results.)
>
>> I don't get what you mean: an ASCII text file is still an
>> ASCII text file no matter what font the user chooses in
>> Notepad, e.g.
>
> First, there is no such thing as an ASCII text file.
Then what is a file that contains only ASCII printable characters (throw in
LF and HT for good measure)?
> For that
> matter, under Unix, there is no such thing as a text file. A
> file is a sequence of bytes.
And if the file is opened in text mode?
> How those bytes are interpreted
> depends on the application.
So the distinction between text and binary mode is .... ?
>> Internally, the program is still working with ASCII strings,
>> assuming English is the language (PURE English that recognizes
>> only 26 letters, that is).
>
> Pure English has [...]
_I_ was giving the definition of "Pure English" in the context (like a
glossary). How many letters are there in the English alphabet? How many?
Surely I wasn't taught umlauts in gradeschool. You are arguing semantics and
I'm arguing practicality: if I can make a simplifying assumption, I'm gonna
do it (and eval that assumption given the task at hand)!
> accented characters in some words (at least
> according to Merriam Webster, for American English). Pure
> English distinguishes between open and closing quotes, both
> single and double. Real English distinguishes between a hyphen,
> an en dash and an em dash.
>
> But that's all irrelevant, because in the end, you're writing
> bytes, and you have to establish some sort of agreement between
> what you mean by them, and what the programs reading the data
> mean. (*If* we could get by with only the characters in
> traditional ASCII, it would be nice, because for historical
> reasons, most of the other encodings encountered encode those
> characters identically. Realistically, however, any program
> dealing with text has to support more, or nobody will use it.)
>
>> Nor does it matter that the platform is Wintel where "behind
>> the scenes" the OS is all UTF-16.
>
>>>>> My (very ancient)
>
>> (Aside Trivia: The "failure" of Sun has been attributed in
>> part to the unwillingness to move to x86 while "the industry"
>> went there. Very ancient indeed!).
>
> Where did you get that bullshit?
This week's trade rags (it's still around here, so if you want the exact
reference, just ask me). It makes sense too: Apple moved off of PowerPC also
probably to avoid doom. I'm a Wintel developer exclusively right now also,
so it makes double sense to me.
> Sun does sell x86 processors
> (using the AMD chip). And IBM and HP are quite successful with
> their lines of non-x86 processors. (IMHO, where Sun went wrong
> was in abandoning its traditional hardware market, and moving
> into software adventures like Java.)
Topic for another thread for sure (those kinds of threads are fun, but don't
result in anything useful). What you said parenthetically above, I kinda
agree with: OpenSolaris looked like a winner to me until they made it
subservient to Java (a platform to push Java). Dumb Sun move #2. (But I only
track these things lightly on the surface).
>
>>>>> Sparcs use ISO 8859-1, my Linux boxes UTF-8, and Windows
>>>>> UTF-16LE.
>
>>>>> The reason is simple, of course: 7-bit ASCII (nor ISO
>>>>> 8859-1, for that matter) doesn't suffice for any known
>>>>> language.
>
>> The application domain you reference is: Operating System.
>> Quite different from CSV text file parser.
>
> I'm not referencing any application domain in particular.
Apparently you referenced OSes a few times.
> Practically all of the Unix applications I know take the
> encoding from the environment; those that don't use UTF-8 (the
> more recent ones, anyway). All of the Windows applications I
> know use UTF-16LE.
>
> Do you think anyone would use MS Office or Open Office if they
> only supported ASCII?
I was talking about a simpler class of programs and libraries even: say, a
program's options file and the ini-file parser (designated subset of 7-bit
ASCII).
Apparently there is a semantic gap in our "debate". I'm not sure where it
is, but I think it may be in that you are talking about what goes on behind
the scenes in an OS, for example, and I'm just using the simple ini-file
parser using some concoction called ASCIIString as the workhorse.
>
>> Your statement could be misleading even if you didn't intend
>> it to be. The "any known language.. blah, blah", is a
>> generalization that fits the real world,
>
> Yes. That's where I live and work. In the real world. I
> produce programs that other people use. (In practice, my
> programs don't usually deal with text, except maybe to pass it
> through, so I'm not confronted with the problem that often. But
> often enough to be aware of it.)
You opportunistically took that out of context. I was alluding to the
difference between the problem domain (the real world) and the solution
domain (technology).
>
>> but software programs eventually are just "zeros and ones".
>
> Not really.
Well you snipped off the context so I don't know how I meant that.
> Programs assign semantics to those ones and zeros.
> Even at the hardware level---a float and an int may contain the
> same number of bits, but the code uses different instructions
> with them. Programs interpret the data.
>
> Which brings us back to my point above---you don't generally
> control how other programs are going to interpret the data you
> write.
If you say so. But if I specify that the ini-files for my program may
contain only the designated subset of 7-bit ASCII, and someone puts an
invalid character in there, expect a nasty error box popping up.
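A minimal sketch of that check (the names are made up): reject any byte
outside the designated 7-bit subset before the parser ever sees the file,
which is where the "nasty error box" would come from.

#include <cstddef>
#include <string>

// Hypothetical helper: printable ASCII plus tab, CR and LF is the
// "designated subset" an ini-file is allowed to contain.
bool isAllowedIniByte(unsigned char c) {
    return c == '\t' || c == '\r' || c == '\n' || (c >= 0x20 && c <= 0x7E);
}

bool validateIniText(const std::string& text, std::size_t& badPos) {
    for (std::size_t i = 0; i < text.size(); ++i) {
        if (!isAllowedIniByte(static_cast<unsigned char>(text[i]))) {
            badPos = i;        // caller reports the offending position
            return false;
        }
    }
    return true;
}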
>
>> The above from you is an odd perspective noting that in
>> another thread you were trying to shoehorn something with,
>> logically, magnitude and direction into a signed integral
>> type.
>
> Sorry, I don't know what you're talking about.
Nevermind. It just seemed like you were arguing both sides of the point in
the two threads combined.
>
>>>> Um, how about the C++ programming language!
>
>>> C++ accepts ISO/IEC 10646 in comments, string and character
>>> literals, and symbol names.
>
>> That's a good expansion point. Let's look at the constituents...
>
>> Comments and Symbols: If you want to program in French or
>> 7-bit kanji (The Matrix?), have at it.
>
> I've already had to deal with C with the symbols in Kanji.
So use it once and then jettison all simpler things? The C/C++ APIs are
overly general (IMO); that's why I don't use them unless the situation
warrants it. Generality makes complexity. Every developer should know how to
implement a linked list, for example. Every developer should have a number
of linked lists he uses, as having only one design paradigm ensures every
program/project is a compromise. IMO. YMMV.
> That
> would have been toward the end of the 1980s. And I haven't seen
> a program in the last ten years which didn't use symbols and
> have comments in either French or German.
But you're in/from France right? Us pesky "americans" huh. ;)
>
>> I guarantee you that I'll never ever use/need 10646 comments
>> or symbols.
>
> Fine. If you write a compiler, and you're the only person to
> use it, you can do whatever you want. But there's no sense in
> talking about it here, since it has no relevance in the real
> world.
You're posting in extremism to promote generalism? Good engineering includes
exploiting simplifying assumptions (and avoiding the hype, on the flip
side). (You'd really put non-ASCII characters in source code comments?
Bizarre.)
Most programs don't need to be international. Data and development tools are
not the same.
>
>> I'll be nice and call it a simplifying assumption but it's
>> really a "no brainer".
>
>> Literals: Not a problem for me, and can be worked around for
>> others (put in file or something: make it data because that's
>> what it is. Programming in French is hard).
>
> No it's not.
Well it would be for me! So yes it is!
> (Actually, the most difficult language to program
> in is English,
Not for me! Context matters! (I was the context, along with many other
developers here).
>
>> Major advantage for me in programming: English is my primary
>> language!
>
> It's one of my primarly languages as well. Not the only one,
> obviously, but one of them.
"primarly" (hehe ;) ). "A set of primary languages?". One primary or none
probably. (None is as good as one, I'm not dissing... I only know two
languages and a third ever so lightly for "I took it in HS").
>
>>>>> Of course, I'm talking here about real programs, designed to
>>>>> be used in production environments. If your goal is just a
>>>>> Sudoku solver, then 7-bit ASCII is fine.
>
>>>> Of course compilers and other software development tools
>>>> are just toys. The English alphabet has 26 characters. No
>>>> more, no less.
>
>>> C, C++, Java and Ada all accept the Unicode character set,
>>> in one form or another.
>
>> There's that operating system example again, which doesn't apply
>> to most application development.
>
> That has nothing to do with the operating system. Read the
> language standards.
Ah ha! The golden calf. I had a feeling there was a god amongst us. :/
I'm not "big" on "standards". (Separate thread!).
>
>>> (Ada, and maybe Java, limit it to the first BMP.) I would
>>> think that this is pretty much the case for any modern
>>> programming language.
>
>> You are interfusing programming languages with the data that
>> they manipulate.
>
> No. Do you know any of the languages in question? All of them
> clearly require support for at least the first BMP of Unicode in
> the compiler. You may not use that possibility---a lot of
> people don't---but it's a fundamental part of the language.
THAT _IS_ the point (!): if a program (or other) doesn't require it, then it
is just CHAFF. This ever-espoused over-generality and
general-is-good-and-always-better gets very annoying in these NGs. Save the
committee stuff for c.l.c++.moderated or the std group. The chaff is probably
holding back practicality for those who can't distinguish politics.
A quote is indicated by quoting.
Descriptions about something are not quotes.
You did not quote and you said you quoted.
> -- it's also
> _all_ the standard says about that character.
I doubt it.
Anyways, you're wrong and really don't know what you're talking about.
ASCII (sans control chars) is a proper subset of Latin-1, with the same code
points. There's no difference. You snipped my suggestion that you look up the
Unicode standard's separate document on its ASCII subset, but what the heck, I
just suggest it again.
> [ ... ]
>
>> The original final ASCII standard from 1967 is no longer available so I'm
>> surprised you claim to have it.
>
> As it happens, we needed a copy at work a few years ago, so we had a
> couple of people working for a week or so to find it. As I recall, the
> copy we found was at a university in Australia, from which we got a
> Xeroxed copy.
:-)
Who do you think you're kidding?
> BTW, you seem to have rather a problem with the date there as well --
> the _original_ final ASCII standard was in 1963. The 1967 version was a
> revision. There was also a 1968 revision, and as I understand it, a 1986
> version as well (though I've never seen a copy of the latter). The
> changes from the 1967 to '68 standards were quite minimal though.
If you're referring to the 1963 standard, it /lacked lowercase letters/.
Are you *really* suggesting that was the final standard?
ROTFL. :-) :-) :-)
Bye, for this topic at least, :-)
- Alf
[ ... ]
> Bye, for this topic at least, :-)
IOW, you've seen the next follow-up I posted that showed in detail that
all the claims you've made in this thread were complete nonsense from
beginning to end!
> > [ ... ]
> >> I'm sorry but as far as I know that's BS. :-)
> > Have you looked at both specifications to find out? Have you
> > even looked at one of them?
> >> Would be nice to know where you picked up that piece of
> >> disinformation, though.
> > It would be nice to know exactly what convinces you that
> > it's disinformation, and particularly whether you have any
> > authoritative source for the claim. Wikipedia certainly
> > doesn't qualify, and as much respect as I have to James, I
> > don't think he does either. It would appear to me that the
> > only authoritative sources on the subject are the standards
> > themselves -- and your statement leads me to doubt that
> > you've consulted them in this case.
Obviously. I'm not the author of the standard, and I don't
actually have access to the text of any of them except Unicode.
From experience, I can say that all of the implementations of
the Unix shells I know (from Bourne on through ksh and bash)
treat the character encoded as 96 in the same way, regardless of
the encoding used. (The original Bourne shell used ASCII---and
added internal information on the eighth bit. The others use
the encoding specified by the LC_CTYPE environment variable,
which may be any of the ISO 8859 encodings, or UTF-8.)
Perhaps a more accurate specification of my claim is that source
files written in ASCII could still be read by programs using one
of the ISO 8859 encodings or UTF-8. At least under Unix. As
for the "goals" of the various standards, I do think that they
were more along the lines of interoperability, rather than
exact identity.
> You're reversing the burden of evidence.
> You made an outrageous claim, which if it were true would make
> ISO 8859-1 a very impractical standard; now please submit your
> evidence that you think is in favor of that claim.
I think that his evidence is clear: the official standards of
each encoding. (At least, that seems to me to be what he is
implying.) A look at the on-line version of ISO 8859-1
confirms what he has said about that. I can't find the ASCII
standard on line, but I've spoken with Jerry personally, and
from what I know of his work, it seems reasonable to assume that
he actually does have access to the standard (which I don't), so
I'll take him at his word for it (unless someone else can post
an actual quote from the standard, contradicting what he's
said).
> You must be thinking of shrink-wrap-type user-interactive
> programs rather than in-house development tools, for example.
No. None of the in house programs I've seen use ASCII, either.
> > text is used to communicate with human beings, and ASCII
> > isn't sufficient for that.
> Millions of posts on USENET seem to contradict that statement.
In what way? The USENET doesn't require, or even encourage
ASCII. My postings are in either ISO 8859-1 or UTF-8, depending
on the machine I'm posting from. I couldn't post them in ASCII,
because they always contain accented characters.
> >> I gave the example of development tools: parsers, etc.
> > Except that the examples are false. C/C++/Java and Ada
> > require Unicode.
> To be general they do. One could easily eliminate that
> requirement and still get much work done. I'm "arguing" not
> against Unicode, but that the ASCII subset, in and of itself,
> is useful.
It's certainly useful, in certain limited contexts. Until
you've seen a BOM or an encoding specification, for example, in
XML. (Although technically, it's not ASCII, but the common
subset of UTF-8 and the ISO 8859 encodings.)
> > Practically everything on the network is UTF-8. Basically,
> > except for some historical tools, ASCII is dead.
> Nah, it's alive and well, even if you choose to call it a
> subset of something else. Parse all of the non-binary group
> posts and see how many non-ASCII characters come up (besides
> your tagline!).
Just about every posting, in some groups I participate in.
> >> I don't get what you mean: an ASCII text file is still an
> >> ASCII text file no matter what font the user chooses in
> >> Notepad, e.g.
>
> > First, there is no such thing as an ASCII text file.
> Then what is a file that contains only ASCII printable
> characters (throw in LF and HT for good measure)?
A file that doesn't exist on any of the machines I have access
to.
At the lowest level, a file is just a sequence of bytes (under
Unix or Windows, at least). At that level, text files don't
exist. It's up to the programs reading or writing the file to
interpret those bytes. And none of the programs I use interpret
them as ASCII.
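To make that concrete, a small sketch (the file name is made up) that reads a file purely as bytes; whether those bytes are later called ASCII, ISO 8859-1 or UTF-8 is a decision the reading program makes, not a property of the file.
#include <cstdio>
#include <fstream>
#include <iterator>
#include <vector>
int main()
{
    // Binary mode: no newline translation, no encoding assumed.
    std::ifstream in("example.dat", std::ios::binary);
    std::vector<unsigned char> bytes((std::istreambuf_iterator<char>(in)),
                                     std::istreambuf_iterator<char>());
    // At this level the content is just numbers; interpreting them as
    // characters in some encoding is up to whoever reads them.
    for (std::size_t i = 0; i != bytes.size(); ++i)
        std::printf("%02X ", static_cast<unsigned>(bytes[i]));
    std::printf("\n");
    return 0;
}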
> > For that matter, under Unix, there is no such thing as a
> > text file. A file is a sequence of bytes.
> And if the file is opened in text mode?
It depends on the imbued locale. (Text mode or not.)
> > How those bytes are interpreted depends on the application.
> So the distinction between text and binary mode is .... ?
Arbitrary. It depends on the system. Under Unix, there isn't
any. Under Windows, it's just the representation of '\n' in the
file. Under other OS's, it's usually a different file type in
the OS (and a file written in text mode can't be opened in
binary, and vice versa).
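A tiny illustration of that last point (file names made up): the only thing text mode changes on Windows is the on-disk representation of '\n', and on Unix the two files below end up byte-for-byte identical.
#include <fstream>
int main()
{
    std::ofstream text("t.txt");                   // text mode (the default)
    std::ofstream bin("b.txt", std::ios::binary);  // binary mode
    text << "line\n";   // on Windows this '\n' is stored as CR LF (0D 0A)
    bin  << "line\n";   // stored as a single LF (0A) everywhere
    return 0;           // under Unix both files contain exactly the same bytes
}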
> >> Internally, the program is still working with ASCII strings,
> >> assuming English is the language (PURE English that recognizes
> >> only 26 letters, that is).
> > Pure English has [...]
> _I_ was giving the definition of "Pure English" in the context
> (like a glossary). How many letters are there in the English
> alphabet? How many?
The same as in French, German or Italian: 26. However, in all
four of these languages, you have cases where you need accents,
which are made by adding something to the representation of the
letter (and require a distinct encoding for the computer)---in
German, there is even a special case of ß, which can't be made
by just adding an accent (but which still isn't a letter).
> Surely I wasn't taught umlauts in gradeschool.
I was taught to spell naïve correctly (although I don't know if
it was grade school or high school).
> You are arguing semantics and I'm arguing practicality: if I
> can make a simplifying assumption, I'm gonna do it (and eval
> that assumption given the task at hand)!
[...]
> >> (Aside Trivia: The "failure" of Sun has been attributed in
> >> part to the unwillingness to move to x86 while "the industry"
> >> went there. Very ancient indeed!).
> > Where did you get that bullshit?
> This week's trade rags (it's still around here, so if you want
> the exact reference, just ask me). It makes sense too: Apple
> moved off of PowerPC also probably to avoid doom. I'm a Wintel
> developer exclusively right now also, so it makes double sense
> to me.
Whatever? The fact remains that 1) Sun does produce processors
with Intel architecture---the choice is up to the customer, and
2) Sun and Apple address entirely different markets, so a
comparison isn't relevant. (The ability to run MS Office on a
desktop machine can be a killer criterion. The ability to run
it on a server is totally irrelevant.)
[...]
> > Do you think anyone would use MS Office or Open Office if they
> > only supported ASCII?
> I was talking about simpler class of programs and libraries
> even: say, a program's options file and the ini-file parser
> (designated subset of 7-bit ASCII).
> Apparently there is a semantic gap in our "debate". I'm not
> sure where it is, but I think it may be in that you are
> talking about what goes on behind the scenes in an OS, for
> example, and I'm just using the simple ini-file parser using
> some concoction called ASCIIString as the workhorse.
All of the ini-files I've seen do allow accented characters.
> > Programs assign semantics to those ones and zeros.
> > Even at the hardware level---a float and an int may contain the
> > same number of bits, but the code uses different instructions
> > with them. Programs interpret the data.
> > Which brings us back to my point above---you don't generally
> > control how other programs are going to interpret the data you
> > write.
> If you say so. But if I specify that ini-files for my
> program may contain only the designated subset of 7-bit ASCII,
> and someone puts an invalid character in there, expect a nasty
> error box popping up.
As long as you're the only user of your programs, that's fine.
Once you have other users, you have to take their desires into
consideration.
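For what it's worth, a minimal sketch of the kind of check the quoted poster describes. The "designated subset" is an assumption here (printable ASCII plus tab, LF and CR), and the file name is made up.
#include <cstdio>
#include <fstream>
// Accept printable ASCII (0x20-0x7E) plus tab, line feed and carriage return.
static bool allowed(unsigned char c)
{
    return (c >= 0x20 && c <= 0x7E) || c == '\t' || c == '\n' || c == '\r';
}
int main()
{
    std::ifstream in("options.ini", std::ios::binary);
    char c;
    long offset = 0;
    while (in.get(c)) {
        unsigned char uc = static_cast<unsigned char>(c);
        if (!allowed(uc)) {
            std::fprintf(stderr, "invalid byte 0x%02X at offset %ld\n",
                         static_cast<unsigned>(uc), offset);
            return 1;   // this is where the "nasty error box" would go
        }
        ++offset;
    }
    return 0;
}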
> > That would have been toward the end of the 1980s. And I
> > haven't seen a program in the last ten years which didn't
> > use symbols and have comments in either French or German.
> But you're in/from France right? Us pesky "americans" huh. ;)
Sort of:-). My mother was American, and I was born and raised in
the United States. My father was German, my wife's Italian, and
I currently live in France (but I've also lived a lot in
Germany). And yes, I do use four languages on an almost daily
basis, so I'm somewhat sensitized to the issue. But I find
that even when working in an English language context, I need
more than just ASCII. And I find that regardless of what I
need, the machines I use don't even offer ASCII as a choice.
> >> I guarantee you that I'll never ever use/need 10646 comments
> >> or symbols.
> > Fine. If you write a compiler, and you're the only person to
> > use it, you can do whatever you want. But there's no sense in
> > talking about it here, since it has no relevance in the real
> > world.
> You're posting in extremism to promote generalism? Good
> engineering includes exploiting simplifying assumptions (and
> avoiding the hype, on the flip side). (You'd really put
> non-ASCII characters in source code comments? Bizarre.)
I have to, because my comments where I work now have to be in
French, and French without accents is incomprehensible. The
need is less frequent in English, but it does occur.
Who cares? I'm merely providing a counterexample to your sweeping claim
that the English alphabet has exactly 26 characters. Or even 52.
--
Richard Herring
>>>> (Aside Trivia: The "failure" of Sun has been attributed in
>>>> part to the unwillingness to move to x86 while "the industry"
>>>> went there. Very ancient indeed!).
>
>>> Where did you get that bullshit?
>
>> This week's trade rags (it's still around here, so if you want
>> the exact reference, just ask me). It makes sense too: Apple
>> moved off of PowerPC also probably to avoid doom. I'm a Wintel
>> developer exclusively right now also, so it makes double sense
>> to me.
>
> Whatever? The fact remains that 1) Sun does produce processors
> with Intel architecture---the choice is up to the customer,
Not enough emphasis on it, and too late (2003) to the x86 party, so it is being
said. What Sun was doing at the time of the Oracle buyout is irrelevant.
What is relevant is the history of the company and the strategic decisions
that were (or weren't!) made, for they are what led to the company's
instability.
> and
> 2) Sun and Apple address entirely different markets, so a
> comparison isn't relevant.
No one was comparing Sun and Apple: I was "hinting" at the fact that x86 is
still growing in its ubiquity. It is suggested by analysts, as I
originally noted, that Sun's decision making regarding x86 vs. its own
SPARC was a major strategic mistake.
I'm just regurgitating what the industry analysts are saying; I find it
interesting to read/study product and company lifecycles and strategies.
I meant letters, not characters. It should be obvious from the CONTEXT ("eye
on the ball" people!) that was what I meant. Perhaps you are trying
opportunistically to imply something different.
> ASCII (sans control chars) is a proper subset of Latin-1,
Since ASCII preceded "Latin-1" (ISO 8859-1), it would be more correct to say
that "Latin-1" is a superset of ASCII. ASCII is the basis of modern
character encodings.
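A short sketch of what that subset relationship means at the byte level (the sample values are hard-coded for illustration): any sequence of bytes below 0x80 reads identically as ASCII, as any ISO 8859 variant, and as UTF-8, whereas a byte like 0xE9 is 'é' in Latin-1 but only half of the two-byte UTF-8 sequence 0xC3 0xA9.
#include <cstddef>
#include <cstdio>
// True if every byte is in the 7-bit range, i.e. the data means the same
// thing whether you label it ASCII, ISO 8859-x or UTF-8.
static bool is_seven_bit(const unsigned char* p, std::size_t n)
{
    for (std::size_t i = 0; i != n; ++i)
        if (p[i] >= 0x80)
            return false;
    return true;
}
int main()
{
    const unsigned char latin1[] = { 'c', 'a', 'f', 0xE9 };       // "café" in ISO 8859-1
    const unsigned char utf8[]   = { 'c', 'a', 'f', 0xC3, 0xA9 }; // "café" in UTF-8
    const unsigned char plain[]  = { 'c', 'a', 'f', 'e' };        // same bytes in all three
    std::printf("%d %d %d\n",
                is_seven_bit(latin1, sizeof latin1),   // 0
                is_seven_bit(utf8, sizeof utf8),       // 0
                is_seven_bit(plain, sizeof plain));    // 1
    return 0;
}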
> BTW, you seem to have rather a problem with the date there as well --
> the _original_ final ASCII standard was in 1963. The 1967 version was
> a revision. There was also a 1968 revision, and as I understand it, a
> 1986 version as well (though I've never seen a copy of the latter).
> The changes from the 1967 to '68 standards were quite minimal though.
1986? Really? What happened in 1986? I thought the ASCII timeline stopped at
1983.
But the underlying protocol is NNTP, and while I don't know for sure, I have
an inkling that it is still a 7-bit protocol (?). But that wasn't my point.
I was suggesting that most USENET posts in threaded discussion groups are
ASCII (by nature of the characters in use by the posts).
> My postings are in either ISO 8859-1 or UTF-8, depending
> on the machine I'm posting from.
You can call it what you want, but if it contains only ASCII characters,
then I consider it an ASCII post.
> I couldn't post them in ASCII,
> because they always contain accented characters.
And that's your prerogative. It's not English though and it introduces
complexity where it is not necessary. Claiming that unnaturalized words are
rationale for "Unicode everywhere" is ludicrous (for lack of a better word
that escapes my mind right now).
>
>>>> I gave the example of development tools: parsers, etc.
>
>>> Except that the examples are false. C/C++/Java and Ada
>>> require Unicode.
>
>> To be general they do. One could easily eliminate that
>> requirement and still get much work done. I'm "arguing" not
>> against Unicode, but that the ASCII subset, in and of itself,
>> is useful.
>
> It's certainly useful, in certain limited contexts.
"limited" is contextual. If a product has "only" 1% market share but has
billions of dollars in sales, is it irrelevant?
> Until
> you've seen a BOM or an encoding specification, for example, in
> XML. (Although technically, it's not ASCII, but the common
> subset of UTF-8 and the ISO 8859 encodings.)
Use the appropriate tool for the job. No more, no less. (That concept seems
to escape language library committees).
>
>>> Practically everything on the network is UTF-8. Basically,
>>> except for some historical tools, ASCII is dead.
>
>> Nah, it's alive and well, even if you choose to call it a
>> subset of something else. Parse all of the non-binary group
>> posts and see how many non-ASCII characters come up (besides
>> your tagline!).
>
> Just about every posting, in some groups I participate in.
You mean the header encoding or transfer-encoding field? Parse just
the message, not the header designations. One could understand "some groups"
in your context: you work for a "foreign" (English is the second language)
company or something right? Well duh, then.
>
>>>> I don't get what you mean: an ASCII text file is still an
>>>> ASCII text file no matter what font the user chooses in
>>>> Notepad, e.g.
>>
>>> First, there is no such thing as an ASCII text file.
>
>> Then what is a file that contains only ASCII printable
>> characters (throw in LF and HT for good measure)?
>
> A file that doesn't exist on any of the machines I have access
> to.
Bah. Enough of your banter/babbling on this. It's a waste of my time.
>
> At the lowest level, a file is just a sequence of bytes (under
> Unix or Windows, at least).
So?
> At that level, text files don't
> exist.
So?
> It's up to the programs reading or writing the file to
> interpret those bytes.
Yes (So?).
> And none of the programs I use interpret
> them as ASCII.
So?
(Is there a point you have in all that?? Oh, that though the files may
contain only 7-bit ASCII characters, there is some relevance in the
supersetting UTF-16/UTF-8 being used by the OS? That's NO point! I can use a
Caterpillar belt-driven tractor with a 3406 diesel in it to work my 10 acre
farm (or buy one to do so), but surely I'd be labeled "eccentric" or worse).
>
>>> For that matter, under Unix, there is no such thing as a
>>> text file. A file is a sequence of bytes.
>
>> And if the file is opened in text mode?
>
> It depends on the imbued locale. (Text mode or not.)
My point was made just above. No need to drag locales into the discussion.
(My "locale" speaks English as the only language (which has only 26 letters,
BTW)).
>
>>> How those bytes are interpreted depends on the application.
>
>> So the distinction between text and binary mode is .... ?
>
> Arbitrary. It depends on the system. Under Unix, there isn't
> any. Under Windows, it's just the representation of '\n' in the
> file. Under other OS's, it's usually a different file type in
> the OS (and a file written in text mode can't be opened in
> binary, and vice versa).
It doesn't matter. "text file" is a valid concept.
>
>>>> Internally, the program is still working with ASCII strings,
>>>> assuming English is the language (PURE English that recognizes
>>>> only 26 letters, that is).
>
>>> Pure English has [...]
>
>> _I_ was giving the definition of "Pure English" in the context
>> (like a glossary). How many letters are there in the English
>> alphabet? How many?
>
> The same as in French, German or Italian: 26.
TY.
> However, in all
> four of these languages, you have cases where you need accents,
Accented words are either still being evaluated for inclusion into English
or are there for disambiguation. I used "Pure English" to mean that which is
made up of only the 26 letters of the English alphabet.
>> Surely I wasn't taught umlauts in gradeschool.
>
> I was taught to spell naïve correctly (although I don't know if
> it was grade school or high school).
'naive' has been naturalized into the English language and does not
have/does not require (unless one feels romantic?) an accent. You were
taught French, not English.
>>> Do you think anyone would use MS Office or Open Office if they
>>> only supported ASCII?
>
>> I was talking about simpler class of programs and libraries
>> even: say, a program's options file and the ini-file parser
>> (designated subset of 7-bit ASCII).
>
>> Apparently there is a semantic gap in our "debate". I'm not
>> sure where it is, but I think it may be in that you are
>> talking about what goes on behind the scenes in an OS, for
>> example, and I'm just using the simple ini-file parser using
>> some concoction called ASCIIString as the workhorse.
>
> All of the ini-files I've seen do allow accented characters.
Again, so? You are suggesting that because you are bilingual or something
that all quest for simple elegance be thrown out the window? What is your
point?! (Certainly it is not engineering practicality).
>
>>> Programs assign semantics to those ones and zeros.
>>> Even at the hardware level---a float and an int may contain the
>>> same number of bits, but the code uses different instructions
>>> with them. Programs interpret the data.
>
>>> Which brings us back to my point above---you don't generally
>>> control how other programs are going to interpret the data you
>>> write.
>
>> If you say so. But if I specify that ini-files for my
>> program may contain only the designated subset of 7-bit ASCII,
>> and someone puts an invalid character in there, expect a nasty
>> error box popping up.
>
> As long as you're the only user of your programs, that's fine.
> Once you have other users, you have to take their desires into
> consideration.
Don't get into politics, cuz you suck at it. Life is too short to get bogged
down in Unicode just because a trivial few feel that English should be
bastardized with unnaturalized ideas like 'naive' with a diacritic. That's
just naive (actually, just crappy engineering, IMO, but I couldn't resist
the "punch line").
>
>>> That would have been toward the end of the 1980s. And I
>>> haven't seen a program in the last ten years which didn't
>>> use symbols and have comments in either French or German.
>
>> But you're in/from France right? Us pesky "americans" huh. ;)
>
> Sort of:-).
Don't even go there: I'm NON-political and here for engineering pursuit (for
the most part).
> My mother was American, and I was born and raised in
> the United States. My father was German, my wife's Italian, and
> I currently live in France (but I've also lived a lot in
> Germany).
And this is relevant why???
> And yes, I do use four languages on an almost daily
> basis, so I'm somewhat sensitized to the issue.
There is no issue: I am not developing international programs (or at least
not targeting any user other than those who can use English). Most programs
do not need internationalization. Overkill is overkill. "Cry me a f'n
river".
> But I find
> that even when working in an English language context, I need
> more than just ASCII.
Sometimes. Program option "inifiles" though? Apparently I've just suggested
to you a simplifying assumption that may indeed simplify your projects and
help you escape the narrowness of technology to some degree.
> And I find that regardless of what I
> need, the machines I use don't even offer ASCII as a choice.
I don't know what you mean. I think they all do.
>
>>>> I guarantee you that I'll never ever use/need 10646 comments
>>>> or symbols.
>
>>> Fine. If you write a compiler, and you're the only person to
>>> use it, you can do whatever you want. But there's no sense in
>>> talking about it here, since it has no relevance in the real
>>> world.
>
>> You're posting in extremism to promote generalism? Good
>> engineering includes exploiting simplifying assumptions (and
>> avoiding the hype, on the flip side). (You'd really put
>> non-ASCII characters in source code comments? Bizarre.)
>
> I have to, because my comments where I work now have to be in
> French, and French without accents is incomprehensible. The
> need is less frequent in English, but it does occur.
Simplify your life: use English (for SW dev at least)!
> > In what way. The USENET doesn't require, or even encourage
> > ASCII.
> But the underlying protocol is NNTP, and while I don't know
> for sure, I have an inkling that it is still a 7-bit protocol
> (?). But that wasn't my point. I was suggesting that most
> USENET posts in threaded discussion groups are ASCII (by
> nature of the characters in use by the posts).
And I'm simply pointing out that that is false. Even in this
group, I sometimes have problems with postings, because the
installed fonts on my machines at work only support ISO 8859-1.
(At home, I use UTF-8, and everything works.) ISO 8859-1 doesn't
have things like opening and closing quotes.
> > My postings are in either ISO 8859-1 or UTF-8, depending
> > on the machine I'm posting from.
> You can call it what you want, but if it contains only ASCII
> characters, then I consider it an ASCII post.
But that's never the case for mine. And I see quite a few
others as well where it's not the case. Even in English
language groups like this one.
> > I couldn't post them in ASCII, because they always contain
> > accented characters.
> And that's your prerogative. It's not English though and it
> introduces complexity where it is not necessary.
I'm not sure what you mean by "it's not English". "Naïve" is a
perfectly good English word. And English uses quotes and dashes
(which aren't available even in ISO 8859-1) and other various
symbols like § not available in ASCII in its punctuation. Not
to mention that a lot of groups handle mathematical topics, and
mathematics uses a lot of special symbols.
And of course, not all groups use (only) English.
> Claiming that unnaturalized words are rationale for "Unicode
> everywhere" is ludicrous (for lack of a better word that
> escapes my mind right now).
It has nothing to do with unnaturalized words (and I don't see
where "naïve" is unnaturalized). It has to do with recognizing
reality.
> My point was made just above. No need to drag locales into the
> discussion. (My "locale" speaks English as the only language
> (which has only 26 letters, BTW)).
And what does the number of letters have to do with it? French
also has only 26 letters. You still put accents on some of
them, and you still use punctuation.
[...]
> 'naive' has been naturalized into the English language and
> does not have/does not require (unless one feels romantic?) an
> accent. You were taught French, not English.
Merriam-Webster disagrees with you.
> > All of the ini-files I've seen do allow accented characters.
> Again, so? You are suggesting that because you are bilingual
> or something that all quest for simple elegance be thrown out
> the window? What is your point?! (Certainly it is not
> engineering practicality).
My point is that software should be usable. And adapt to the
people using it, not vice versa. And that even in English, you
need more than simple ASCII. (At least, if you want to use
English correctly.)
[---]
> > As long as you're the only user of your programs, that's fine.
> > Once you have other users, you have to take their desires into
> > consideration.
> Don't get into politics, cuz you suck at it. Life is too short
> to get bogged down in Unicode just because a trivial few feel
> that English should be bastardized with unnaturalized ideas
> like 'naive' with a diacritic.
Or quotes. Or dashes. Or any number of other things. And that
"trivial few" includes the authors of all of the major
dictionaries I have access to.
If you don't know English well, that's your problem.
[...]
> > I have to, because my comments where I work now have to be in
> > French, and French without accents is incomprehensible. The
> > need is less frequent in English, but it does occur.
> Simplify your life: use English (for SW dev at least)!
If you've ever tried to understand English written by a
non-native speaker, you'll realize that it's much simpler to let
them use French (or German, when I worked there). Communication
is an important part of software engineering, and communication
is vastly improved if people can use their native language.
>I used "Pure English" to mean that which is
>made up of only the 26 letters of the English alphabet.
"Pure English" is a language only spoken by True Scotsmen [tm].
--
Richard Herring
That doesn't help you, since you need more than just those 26 or 52
letters to represent English words.
> It should be obvious from the CONTEXT ("eye
>on the ball" people!) that was what I meant.
It's irrelevant, since the real CONTEXT is not how many there are, but
whether you can write English with them.
>Perhaps you are trying
>opportunistically
?
>to imply something different.
No, I'm not making a pedantic point about the difference between
"letter" and "character". Surely it should be obvious that I'm simply
(re-)stating the fact that ASCII's repertoire is insufficient to
represent even English.
--
Richard Herring
Not sure exactly what happened, but here's the ANSI web store page for
it:
http://webstore.ansi.org/RecordDetail.aspx?sku=ANSI+INCITS+4-1986+(R2007)
That's a strawman that conveniently avoids any context; The English alphabet
has exactly 26 letters.
>
>> It should be obvious from the CONTEXT ("eye
>>on the ball" people!) that was what I meant.
>
> It's irrelevant, since the real CONTEXT is not how many there are, but
> whether you can write English with them.
No, you are wrong: the context is the context, no some contrived generality
you expect some dummy to believe.
>
>>Perhaps you are trying
>>opportunistically
>
> ?
>
>>to imply something different.
>
> No, I'm not making a pedantic point about the difference between "letter"
> and "character".
Well you failed miserably then because you didn't say anything to that
effect: I did.
> Surely it should be obvious that I'm simply (re-)stating the fact that
> ASCII's repertoire is insufficient to represent even English.
ASCII is largely adequate: the English alphabet has 26 letters. I'm not
worried about the few unnaturalized foreign words that make it into
Webster's dictionary that have diacritics.
Well, I'm not that interested. Curious, yes; interested in buying something
where most likely nothing significant happened (as evidenced by no one knowing
of anything significant happening), no.
(Jerry: are taglines out of style? Cuz you're still wearing bell-bottoms).
I don't believe you. Cuz I've been here off-and-on for years reading posts
and rarely do I find a non-ASCII character in a post (save for those
obsolete taglines).
> Even in this
> group, I sometimes have problems with postings, because the
> installed fonts on my machines at work only support ISO 8859-1.
And I have to use OE-QuoteFix to respond to YOUR posts. But 26 letters are
still just 26 letters. (Of course 10 digits are understood also).
> (At home, I use UTF-8, and everything works.) ISO 8859-1 doesn't
> have things like opening and closing quotes.
I agree: you foreigners are messing things up. ;)
>
>>> My postings are in either ISO 8859-1 or UTF-8, depending
>>> on the machine I'm posting from.
>
>> You can call it what you want, but if it contains only ASCII
>> characters, then I consider it an ASCII post.
>
> But that's never the case for mine.
You mean your tagline? I think I may be noticing a trend toward being nice
and dropping those: even I hardly sign my posts anymore (cuz it's stupid:
the newsreader will tell you who the post is from if you wanna know).
> And I see quite a few
> others as well where it's not the case. Even in English
> language groups like this one.
You're talking about standard encoding designations and I'm simply talking
about the best language to program in and to program to.
>
>>> I couldn't post them in ASCII, because they always contain
>>> accented characters.
>
>> And that's your prerogative. It's not English though and it
>> introduces complexity where it is not necessary.
>
> I'm not sure what you mean by "it's not English".
It's not English because English has only 26 letters, without diacritics.
> "Naïve" is a
> perfectly good English word.
The naturalized word 'naive' has been accepted into the English language but
the way you encoded it is still a foreign word.
> And English uses quotes and dashes
> (which aren't available even in ISO 8859-1)
You mean like dash as a separate character from minus?
> and other various
> symbols like § not available in ASCII in its punctuation.
Symbols are not word elements. The code page concept is symbols.
> Not
> to mention that a lot of groups handle mathematical topics, and
> mathematics uses a lot of special symbols.
Separate code pages.
>
> And of course, not all groups use (only) English.
That of course is ignoring the context: it is a strawman argument (at best,
but surely it is just propaganda).
>
>> Claiming that unnaturalized words are rationale for "Unicode
>> everywhere" is ludicrous (for lack of a better word that
>> escapes my mind right now).
>
> It has nothing to do with unnaturalized words (and I don't see
> where "naïve" is unnaturalized). It has to do with recognizing
> reality.
Reality is that 'naive' is a naturalized English word and your encoding is a
foreign word: it has everything to do with naturalization.
>
>> My point was made just above. No need to drag locales into the
>> discussion. (My "locale" speaks English as the only language
>> (which has only 26 letters, BTW)).
>
> And what does the number of letters have to do with it?
Everything: I program in a spoken language and a programming language. I
chose my targets or at least know them: that is the context of the software
development.
> French
> also has only 26 letters.
That's misleading: French has diacritics, English does not.
> You still put accents on some of
> them, and you still use punctuation.
Strawman. You're trying to make a case for hieroglyphics as relevant. And to
me, if you want: I'm intuitive and like abstractions, but in a programming
paradigm, I don't want it wasting my life.
>
> [...]
>> 'naive' has been naturalized into the English language and
>> does not have/does not require (unless one feels romantic?) an
>> accent. You were taught French, not English.
>
> Merriam-Webster disagrees with you.
Ah! I mentioned Webster long ago in this thread and discounted any
relevance: but you grasp onto that because that is all you have:
cutting-edge colloquialism as definition of the English language. And you're
wrong big-time for all perspectives including the most important one in this
NG: engineering practicality.
>
>>> All of the ini-files I've seen do allow accented characters.
>
>> Again, so? You are suggesting that because you are bilingual
>> or something that all quest for simple elegance be thrown out
>> the window? What is your point?! (Certainly it is not
>> engineering practicality).
>
> My point is that software should be usable.
I don't believe that that is your point at all: you have an agenda, IMI (In My
Intuition).
> [---]
>>> As long as you're the only user of your programs, that's fine.
>>> Once you have other users, you have to take their desires into
>>> consideration.
>
>> Don't get into politics, cuz you suck at it. Life is too short
>> to get bogged down in Unicode just because a trivial few feel
>> that English should be bastardized with unnaturalized ideas
>> like 'naive' with a diacritic.
>
> Or quotes. Or dashes.
Separate issue. Degree.
> Or any number of other things.
Well why don't you list and number them (for progeny).
> And that
> "trivial few" includes the authors of all of the major
> dictionaries I have access to.
Dictionaries are of course political things. Your dictionary defense is
quite bizarre. It's akin to offering hieroglyphics as an argument: lame.
>
> If you don't know English well, that's your problem.
You mean if I don't want to accept bastardization/perversion it's my
problem.
>
> [...]
>>> I have to, because my comments where I work now have to be in
>>> French, and French without accents is incomprehensible. The
>>> need is less frequent in English, but it does occur.
>
>> Simplify your life: use English (for SW dev at least)!
>
> If you've ever tried to understand English written by a
> non-native speaker, you'll realize that it's much simpler to let
> them use French (or German, when I worked there).
Exceptional case.
> Communication
> is an important part of software engineering, and communication
> is vastly improved if people can use their native language.
Strawman/propaganda.
I think you need to check the definition of "strawman".
>that conveniently avoids any context; The English alphabet
>has exactly 26 letters.
(And the Welsh alphabet has 28, despite lacking J, K, Q, V, X, Z). So
what? 26 letters alone are not sufficient for writing English.
>
>>
>>> It should be obvious from the CONTEXT ("eye
>>>on the ball" people!) that was what I meant.
>>
>> It's irrelevant, since the real CONTEXT is not how many there are, but
>> whether you can write English with them.
>
>No, you are wrong: the context is the context, not some contrived generality
>you expect some dummy to believe.
Nor is it what you want to redefine it to be, as any "dummy" can
discover by simply reading the thread. "7-bit ASCII is your friend".
>
>>
>>>Perhaps you are trying
>>>opportunistically
>>
>> ?
>>
>>>to imply something different.
>>
>> No, I'm not making a pedantic point about the difference between "letter"
>> and "character".
>
>Well you failed miserably then because you didn't say anything to that
>effect: I did.
Eh? I failed miserably to say something I wasn't trying to say?
>
>> Surely it should be obvious that I'm simply (re-)stating the fact that
>> ASCII's repertoire is insufficient to represent even English.
>
>ASCII is largely adequate:
Largely. Thank you for that concession.
>the English alphabet has 26 letters.
So you keep telling us.
>I'm not
>worried about the few unnaturalized foreign words that make it into
>Webster's dictionary that have diacritics.
>
Fine; that's your choice. And if the customers for your software are
equally not worried that it can't cope with such words, that's even more
fine.
But _you_ don't get to define what's "unnaturalized", "foreign" or
"Pure English".
http://en.wikipedia.org/wiki/No_true_Scotsman
--
Richard Herring
> >>> In what way. The USENET doesn't require, or even encourage
> >>> ASCII.
> >> But the underlying protocol is NNTP, and while I don't know
> >> for sure, I have an inkling that it is still a 7-bit protocol
> >> (?). But that wasn't my point. I was suggesting that most
> >> USENET posts in threaded discussion groups are ASCII (by
> >> nature of the characters in use by the posts).
> > And I'm simply pointing out that that is false.
> I don't believe you.
It's easy enough to verify. I often have problems with postings
because they contain characters which aren't present in ISO
8859-1 (which are the only encodings for which fonts are
installed on my machines at work).
[...]
> > (At home, I use UTF-8, and everything works.) Which doesn't
> > have things like opening and closing quotes.
> I agree: you foreigners are messing things up. ;)
Opening and closing quotes are part of English. At least, part
of the English used by people who've gotten beyond kindergarten.
> >>> My postings are in either ISO 8859-1 or UTF-8, depending
> >>> on the machine I'm posting from.
> >> You can call it what you want, but if it contains only ASCII
> >> characters, then I consider it an ASCII post.
> > But that's never the case for mine.
> You mean your tagline?
I don't have a "tagline". In fact, I don't know what you mean
by a "tagline". My .sig uses accented characters, because it
contains my address. I'll also occasionally use characters
outside of the 96 basic characters in the body of my postings:
things like a section reference (§) when quoting the standard,
for example, or a non-breaking space.
If I had UTF-8 everywhere, I'd also quote correctly.
[...]
> > I'm not sure what you mean by "it's not English".
> It's not English because English has only 26 letters, without
> diacritics.
So the Merriam Webster Dictionary is not English (since it
contains diacritics on some words, and uses opening and closing
quotes, and a lot of other characters other than the 26
letters).
> > "Naïve" is a perfectly good English word.
> The naturalized word 'naive' has been accepted into the
> English language but the way you encoded it is still a foreign
> word.
Not according to Merriam Webster. But of course, you know more
about English than the standard dictionaries.
> > And English uses quotes and dashes (which aren't available
> > even in ISO 8859-1)
> You mean like dash as a separate character from minus?
A minus sign, a hyphen, an n-dash and an m-dash are four
separate characters. Because I don't have the dashes in ISO
8859-1, I simulate them with -- and ---, but it's really a hack.
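For reference, a short sketch listing the code points involved; the only new information over the paragraph above is the UTF-8 byte sequence, shown for the en dash.
#include <cstdio>
int main()
{
    // Distinct characters that plain ASCII collapses onto '-':
    //   U+002D HYPHEN-MINUS (the ASCII character)
    //   U+2010 HYPHEN
    //   U+2013 EN DASH      (often simulated as "--")
    //   U+2014 EM DASH      (often simulated as "---")
    //   U+2212 MINUS SIGN   (the mathematical operator)
    // None of the last four exist in ISO 8859-1; in UTF-8 each takes
    // three bytes, e.g. EN DASH:
    const unsigned char en_dash[] = { 0xE2, 0x80, 0x93 };
    std::printf("%02X %02X %02X\n",
                static_cast<unsigned>(en_dash[0]),
                static_cast<unsigned>(en_dash[1]),
                static_cast<unsigned>(en_dash[2]));
    return 0;
}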
> > and other various symbols like § not available in ASCII in
> > its punctuation.
> Symbols are not word elements. The code page concept is
> symbols.
Nor are blanks. Are you saying that the encoding shouldn't
support blanks either?
> > Not to mention that a lot of groups handle mathematical
> > topics, and mathematics uses a lot of special symbols.
> Separate code pages.
What the hell is a "code page"?
> >> Claiming that unnaturalized words are rationale for
> >> "Unicode everywhere" is ludicrous (for lack of a better
> >> word that escapes my mind right now).
> > It has nothing to do with unnaturalized words (and I don't
> > see where "naïve" is unnaturalized). It has to do with
> > recognizing reality.
> Reality is that 'naive' is a naturalized English word and your
> encoding is a foreign word:
Not according to any of the dictionaries I've consulted. All
give "naïve" as a perfectly correct, native American English
spelling.
> >> My point was made just above. No need to drag locales into
> >> the discussion. (My "locale" speaks English as the only
> >> language (which has only 26 letters, BTW)).
> > And what does the number of letters have to do with it?
> Everything: I program in a spoken language and a programming
> language. I chose my targets or at least know them: that is
> the context of the software development.
The context of software development is that each programming
language defines a set of characters it accepts. Fortran used
the least, I believe---it was designed so that you could get six
6 bit characters in a word. C and C++ require close to a
million.
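To illustrate that last claim, C++ lets a source file that is itself plain 7-bit ASCII name any of those characters through universal-character-names; a minimal sketch (how the character ends up encoded in the literal depends on the implementation's execution character set):
#include <cstdio>
int main()
{
    // \u00EF is LATIN SMALL LETTER I WITH DIAERESIS, so this spells "naïve"
    // even though the source file contains only 7-bit characters.
    const char word[] = "na\u00EFve";
    std::printf("%s\n", word);
    return 0;
}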
> > French also has only 26 letters.
> That's misleading: French has diacritics, English does not.
Your talk about letters is what is misleading. I'm just
pointing out that it's irrelevant.
> > [...]
> >> 'naive' has been naturalized into the English language and
> >> does not have/does not require (unless one feels romantic?)
> >> an accent. You were taught French, not English.
> > Merriam-Webster disagrees with you.
> Ah! I mentioned Webster long ago in this thread and discounted
> any relevance:
Merriam-Webster is irrelevant to what is correct American
English use?
> > [---]
> > If you don't know English well, that's your problem.
> You mean if I don't want to accept bastardization/perversion
> it's my problem.
I mean that if you don't want to accept generally accepted,
standard usage, it's your problem. A serious one, at that,
symptomatic of a serious social maladjustment.
> > [...]
> >>> I have to, because my comments where I work now have to be in
> >>> French, and French without accents is incomprehensible. The
> >>> need is less frequent in English, but it does occur.
> >> Simplify your life: use English (for SW dev at least)!
> > If you've ever tried to understand English written by a
> > non-native speaker, you'll realize that it's much simpler to let
> > them use French (or German, when I worked there).
> Exceptional case.
Native English speakers represent less than 5% of the world's
population, which means that being a native English speaker is
the exceptional case.
>> The naturalized word 'naive' has been accepted into the
>> English language but the way you encoded it is still a foreign
>> word.
>
>Not according to Merriam Webster. But of course, you know more
>about English than the standard dictionaries.
And FWIW the standard British-English dictionaries agree with M-W on
this.
--
Richard Herring
I always found it a bit amusing that the English alphabet officially has only A
through Z, but that the language contains words like "m�neuver". :-) And no, not
making that up. I last encountered that last week, reading Jack London's "White
Fang", I think it was (if it wasn't the other dog book).
Cheers,
- Alf
ITYM "manœuvre". HTH.
--
Richard Herring
Thanks, possibly. But as I recall the spelling in Jack London's novel started
with "mæ". Someone borrowed that book though, and I'm too lazy to check out
Oxford's or Merriam Webster (as I recall it's not in all dictionaries).
>>> I always found it a bit amusing that the English alphabet officially
>>>has only A through Z, but that the language contains words like
>>>"mæneuver".
>> ITYM "manœuvre". HTH.
>
>Thanks, possibly. But as I recall the spelling in Jack London's novel
>started with "mæ". Someone borrowed that book though, and I'm too lazy
>to check out Oxford's or Merriam Webster (as I recall it's not in all
>dictionaries).
I'd be surprised if it's in _any_.
"White Fang" at http://www.gutenberg.org/files/910/910.txt and "The
Call of the Wild" at http://www.gutenberg.org/files/215/215.txt each
contain one instance of "manoeuvre" and one "manoeuvred". Neither has
any words beginning with "mae".
--
Richard Herring
My edition is a paperback, I think Penguin. It has "æ".
Cheers & hth.,
You think. But by your own admission you lent it out and can't check...
Google's counter, for what that's worth, estimates ~400 "results" for
"maeneuver", many obviously from the same source. Compare that with ~4
million for "manœuvre" (in both cases, it doesn't seem to care whether
you type the ligature or separate letters).
I'd say that was entirely compatible with people remembering that the
word has a ligature, but not remembering which pairs of letters should
be joined.
--
Richard Herring
What's the point of an insinuation like that?
I have not expressed any doubt about whether the book uses the 'æ' spelling, and
indicating otherwise is just dishonest (i.e., you are, above): I was not making
a touchy-feely think-that-perhaps-it-was-like-that /argument/, as you insinuate;
I was just reporting a *fact*.
I think the book edition I have is published by Penguin.
That printed book uses 'æ', while the online text you've found apparently
doesn't, presumably because it's ASCII text (note: ASCII doesn't have 'æ').
The word 'mæneuver', with 'æ', modulo spelling, is in at least one main English
dictionary.
> Google's counter, for what that's worth, estimates ~400 "results" for
> "maeneuver", many obviously from the same source. Compare that with ~4
> million for "manœuvre" (in both cases, it doesn't seem to care whether
> you type the ligature or separate letters).
There's also probably a difference between British English and US English.
> I'd say that was entirely compatible with people remembering that the
> word has a ligature, but not remembering which pairs of letters should
> be joined.
It's my impression that the old (original?) spelling used 'æ', but anyways, I
can't recall ever seeing the word spelled with 'œ'.
Cheers,
Wait a minute, sorry.
I was in the wrong frame of mind because I very recently yet again had a certain guy
attempting to stick needles in my back so to speak. I don't understand that he
never learns but he doesn't, and I get sort of upset by having to punch him down
again and again. And then for a while, after such debacle, I feel very
suspicious about anything that might look like needles being waved behind me...
It may be that you're right, that as a Norwegian (we have æøå but no œ) I've
consistently misread an œ as a Norwegian æ.
Could be! :-)
But anyways, the point was that the official alphabet of English, A through Z,
isn't sufficient to express all valid spellings of all English words...
Cheers & hth.,
> >>>> It's not English because English has only 26 letters, without
> >>>> diacritics.
> >>> So the Merriam Webster Dictionary is not English (since it
> >>> contains diacritics on some words, and uses opening and closing
> >>> quotes, and a lot of other characters other than the 26
> >>> letters).
> >> I always found it a bit amusing that the English alphabet
> >> officially has only A through Z, but that the language
> >> contains words like "mæneuver".
> > ITYM "manœuvre". HTH.
> Thanks, possibly. But as I recall the speling in Jack London's
> novel started with "mæ". Someone borrowed that book though,
> and I'm too lazy to check out Oxford's or Merriam Webster (as
> I recall it's not in all dictionaries).
The word maneuver should be in any American English dictionary.
The American Heritage Dictionary also lists manoeuvre as a
"chiefly British variant". The word in French from which the
English derives is "manœuvre"; in this case, this spelling is
not (I believe) acceptable in the US, but it wouldn't surprise
me if it were acceptable, or even the preferred spelling, in
Great Britain.
If you saw mæneuver, it was a typo. (Such things do occur---I
remember one case where the typesetter went to extreme pains to
put a cedilla on the c in the French city of Mâcon. Which, of
course, has no cedilla on the c, but does have an accent
circumflex on the a, which the typesetter missed.)
But it's still the Encyclopædia Britannica (which despite the
name, is published in Chicago).
All of which begs the question: what is a letter? (TeX provides
character encodings for various other ligatures, like fl or fi,
for example.) The French "standards" also speak of 26 letters,
but not only has accents, but an obligatory œ (in many everyday
words, like cœur), which is in opposition to "oe" (in other
words, like coefficient). Both German and French collate
accented characters as if they were unaccented (up to a certain
point, at least), but Swedish (and maybe Norwegian) consider
them completely different letters, appended to the end of the
alphabet (and Spanish treats ll as if it were a different letter
than l, collating it after la-lz).
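A small sketch of how those collation differences surface in C++ (the locale names are the usual glibc ones and must be installed; whether the two comparisons really disagree depends on the system's locale data):
#include <iostream>
#include <locale>
#include <stdexcept>
#include <string>
int main()
{
    const std::string a = "\xC3\xA4rm";  // "ärm" in UTF-8 (a-umlaut is C3 A4)
    const std::string b = "zebra";
    try {
        std::locale de("de_DE.UTF-8");   // German: a-umlaut collates like 'a'
        std::locale sv("sv_SE.UTF-8");   // Swedish: a-umlaut sorts after 'z'
        // std::locale::operator() compares strings via the collate facet.
        std::cout << "de_DE: arm-with-umlaut < zebra ? " << de(a, b) << '\n';
        std::cout << "sv_SE: arm-with-umlaut < zebra ? " << sv(a, b) << '\n';
    } catch (const std::runtime_error&) {
        std::cout << "named locales not available on this system\n";
    }
    return 0;
}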
OK. No insinuation intended, just clarifying the difference between
recollection and hard fact.
>
>I was in the wrong frame of mind because I very recently yet again had a
>certain guy attempting to stick needles in my back so to speak. I don't
>understand that he never learns but he doesn't, and I get sort of upset
>by having to punch him down again and again. And then for a while,
>after such debacle, I feel very suspicious about anything that might
>look like needles being waved behind me...
>
>It may be that you're right, that as a Norwegian (we have æøå but no
>œ) I've consistently misread an œ as a Norwegian æ.
>
>Could be! :-)
>
>But anyways, the point was that the official alphabet of English, A
>through Z, isn't sufficient to express all valid spellings of all
>English words...
Indeed. Another example is the use in some English words of a diaeresis
to mark a syllable break, e.g. "coöperate" instead of "co-operate".
--
Richard Herring
>> the official alphabet of English, A
>> through Z, isn't sufficient to express all valid spellings of all
>> English words...
>
> Indeed. Another example is the use in some English words of a diaeresis
> to mark a syllable break, e.g. "coöperate" instead of "co-operate".
I've never even seen that. The hyphen looks awfully dated, too. A
through Z are sufficient for the vast majority of English text; while
diacritics, ligatures, and other decorations are occasionally used, they
are rarely if ever necessary.
Note that this says nothing of what general-purpose computer programming
languages ought to support. We don't live in an ASCII world, English is
not the only language, and frankly, I'm not thrilled with the fact that
bad English is becoming a sort of international pidgin language.
I believe it only survives in the "New Yorker" ;-)
> The hyphen looks awfully dated, too. A through Z are sufficient for
>the vast majority of English text; while diacritics, ligatures, and
>other decorations are occasionally used, they are rarely if ever
>necessary.
>
>Note that this says nothing of what general-purpose computer
>programming languages ought to support. We don't live in an ASCII
>world, English is not the only language, and frankly, I'm not thrilled
>with the fact that bad English is becoming a sort of international
>pidgin language.
--
Richard Herring
> In message <M6ydnUmf0oghgYnX...@giganews.com>, Jeff Schwab
> <je...@schwabcenter.com> writes
>>Richard Herring wrote:
>>> In message <guuu8u$87c$1...@news.eternal-september.org>, Alf P. Steinbach
>>
>>>> the official alphabet of English, A through Z, isn't sufficient to
>>>> express all valid spellings of all English words...
>>> Indeed. Another example is the use in some English words of a diaeresis
>>> to mark a syllable break, e.g. "coöperate" instead of "co-operate".
>>
>>I've never even seen that.
>
> I believe it only survives in the "New Yorker" ;-)
What does that mean? I understood what it *said* until I got to the smiley.
Do you believe that the smiley has some basic and agreed upon meaning? If
so, what is that meaning and where is it defined? I believe many people use
it as a form of negation, kind of an equivalent of a SNL "not". As far as I
am concerned it has just become an annoying noise glyph which may alter the
meaning of what is said and may not.
> >>>> the official alphabet of English, A through Z, isn't
> >>>> sufficient to express all valid spellings of all English
> >>>> words...
> >>> Indeed. Another example is the use in some English words of a diaeresis
> >>> to mark a syllable break, e.g. "coöperate" instead of "co-operate".
> >>I've never even seen that.
> > I believe it only survives in the "New Yorker" ;-)
> What does that mean? I understood what it *said* until I got
> to the smiley. Do you believe that the smiley has some basic
> and agreed upon meaning?
It means that the preceding sentence isn't to be taken too
seriously. If the sentence had been spoken, the author would
have been smiling when he said it.
> If so, what is that meaning and where is it defined? I
> believe many people use it as a form of negation, kind of an
> equivalent of a SNL "not".
Not really. It means that the statement is being said in a
joking manner. In some contexts, this might imply that it is
false (but not at all in the same way that the "not" does), but
certainly not in all contexts.
> As far as I am concerned it has just become an annoying noise
> glyph which may alter the meaning of what is said and may not.
It corresponds to something you'd use tone of voice or
expression to indicate in spoken English. It's used on the net
because the level of English here is often closer to spoken
English than it is to traditional written English.
http://www.ccil.org/jargon/jargon_20.html#TAG550
http://en.wikipedia.org/wiki/Emoticon
> I believe many people use
>it as a form of negation, kind of an equivalent of a SNL "not".
Not in my experience.
> As far as I
>am concerned it has just become an annoying noise glyph which may alter the
>meaning of what is said and may not.
>
:-(
--
Richard Herring
>>>>> Another example is the use in some English words of a
>>>>> diaeresis
>>>>> to mark a syllable break, e.g. "coöperate" instead of "co-operate".
>>>>
>>>> I've never even seen that.
>>>
>>> I believe it only survives in the "New Yorker" ;-)
>>
>> What does that mean? I understood what it *said* until I got to the
>> smiley.
>> Do you believe that the smiley has some basic and agreed upon
>> meaning? If
>> so, what is that meaning and where is it defined?
Without the emoticon, the statement could be taken to mean, in all
seriousness, that diacritics were specific to a particular publication.
The emoticon makes it clear that The New Yorker is instead serving as
a metaphor for a portion of the literary community that is either
sophisticated or pretentious, depending on whom you ask.
>> As far as I
>> am concerned it has just become an annoying noise glyph which may
>> alter the
>> meaning of what is said and may not.
>>
> :-(
I'm not ordinarily the sort of person who writes "LOL," but... :)
Probably, but you know what I meant.
>
>>that conveniently avoids any context; The English alphabet
>>has exactly 26 letters.
>
> (And the Welsh alphabet has 28, despite lacking J, K, Q, V, X, Z). So
> what? 26 letters alone are not sufficient for writing English.
Who's talking about literary writings?! I'm talking about programming and
engineering.
>>
>>>
>>>> It should be obvious from the CONTEXT ("eye
>>>>on the ball" people!) that was what I meant.
>>>
>>> It's irrelevant, since the real CONTEXT is not how many there are, but
>>> whether you can write English with them.
>>
>>No, you are wrong: the context is the context, not some contrived
>>generality
>>you expect some dummy to believe.
>
> Nor is it what you want to redefine it to be, as any "dummy" can discover
> by simply reading the thread. "7-bit ASCII is your friend".
That quote is true, at least if you know how to do it. I went on to say:
"no, probably not for you, but indeed it is for me!". So you misquoted me:
do you enjoy taking things out of context and using them opportunistically
evilly?
>>> Surely it should be obvious that I'm simply (re-)stating the fact that
>>> ASCII's repertoire is insufficient to represent even English.
>>
>>ASCII is largely adequate:
>
> Largely. Thank you for that concession.
Twas no concession. The implication was that some applications, by mandate
or desire, are specified to target other languages or language elements
(letters mainly, we are talking about).
>
>>the English alphabet has 26 letters.
>
> So you keep telling us.
That's the definition. It's also a great (if not obvious) engineering
recognition.
>
>>I'm not
>>worried about the few unnaturalized foreign words that make it into
>>Webster's dictionary that have diacritics.
>>
> Fine; that's your choice. And if the customers for your software are
> equally not worried that it can't cope with such words, that's even more
> fine.
>
> But _you_ don't get to define what's "unnaturalized", "foreign" or "Pure
> English".
You are free to sprinkle hieroglyphics into your source code variable names
rather than using just ASCII. You really should stop trying to get me to do
so though, cuz it ain't gonna ever happen.
The first part about NNTP was not the main point in the passage: the fact
that most USENET posts in threaded discussion groups contain only 7-bit
ASCII characters is what I was hypothesizing.
>>> (At home, I use UTF-8, and everything works.) ISO 8859-1 doesn't
>>> have things like opening and closing quotes.
>
>> I agree: you foreigners are messing things up. ;)
>
> Opening and closing quotes are part of English. At least, part
> of the English used by people who've gotten beyond kindergarten.
And there's room for them in a 7-bit definition. The question is whether
it's worth it. You're hardly making any case for Unicode with the quotes
argument.
>
>>>>> My postings are in either ISO 8859-1 or UTF-8, depending
>>>>> on the machine I'm posting from.
>
>>>> You can call it what you want, but if it contains only ASCII
>>>> characters, then I consider it an ASCII post.
>
>>> But that's never the case for mine.
>
>> You mean your tagline?
>
> I don't have a "tagline". In fact, I don't know what you mean
> by a "tagline". My .sig uses accented characters, because it
> contains my address.
".sig" then. I've been calling it a tagline.
> I'll also occasionally use characters
> outside of the 96 basic characters in the body of my postings:
> things like a section reference (§) when quoting the standard,
> for example, or a non-breaking space.
That's not a letter: it's a symbol. Booch notation is its own contrivance
(like a "codepage") also. Blueprints and wiring diagrams have bunches of
symbols. They are not part of the English language and are not additions to
the 26 recognized characters of the English language.
>>> I'm not sure what you mean by "it's not English".
>
>> It's not English because English has only 26 letters, without
>> diacritics.
>
> So the Merriam Webster Dictionary is not English
Published dictionaries have similarities to published bibles; that's all
they are: one point of view. Not recognizing the environment of this NG and
bringing everything remotely related to the discussion is dreary (at best).
>>> "Naïve" is a perfectly good English word.
>
>> The naturalized word 'naive' has been accepted into the
>> English language but the way you encoded it is still a foreign
>> word.
>
> Not according to Merriam Webster.
Merriam Webster is a lame "appeal to authority" argument. Especially given all
the dialog to date in just this thread.
> But of course, you know more
> about English than the standard dictionaries.
See "lame appeal to authority" above. Also see "condescension/defamation as
a weapon or as a last resort".
>
>>> And English uses quotes and dashes (which aren't available
>>> even in ISO 8859-1)
>
>> You mean like dash as a separate character from minus?
>
> A minus sign, a hyphen, an n-dash and an m-dash are four
> separate characters.
Yet C++ embraces contextual meaning. (Aside). While there is room for
separate characters for the aforementioned in a 7-bit encoding, you've not
made any case for them being valuable as separate characters in an encoding,
and therefore much less case for such a thing in ALL encodings (or the
proverbial "The Encoding to End all Encodings").
> Because I don't have the dashes in ISO
> 8859-1, I simulate them with -- and ---, but it's really a hack.
You want dash, I'm OK with a dash character added to the punctuation set
(it's just for data though and not code).
>
>>> and other various symbols like § not available in ASCII in
>>> its punctuation.
>
>> Symbols are not word elements. The code page concept is
>> symbols.
>
> Nor are blanks. Are you saying that the encoding shouldn't
> support blanks either?
You should have a better understanding of my needs by now. Perhaps this post
will be the "keystone" for you in that regard.
>
>>> Not to mention that a lot of groups handle mathematical
>>> topics, and mathematics uses a lot of special symbols.
>
>> Separate code pages.
>
> What the hell is a "code page"?
You know very well of the concept, so stop being facetious.
>
>>>> Claiming that unnaturalized words are rationale for
>>>> "Unicode everywhere" is ludicrous (for lack of a better
>>>> word that escapes my mind right now).
>
>>> It has nothing to do with unnaturalized words (and I don't
>>> see where "naïve" is unnaturalized). It has to do with
>>> recognizing reality.
>
>> Reality is that 'naive' is a naturalized English word and your
>> encoding is a foreign word:
>
> Not according to any of the dictionaries I've consulted.
Said to Grasshopper: "don't believe all that you read Grasshoppa".
> All
> give "naïve" as a perfectly correct, native American English
> spelling.
That's a false statement. It is predicated upon the notion of a pre-existing
definition (the "god win" "argument"?) and upon that "definition" being
accepted by the reader/recipient. (The same reasoning is abused in judicial
systems, BTW). Funny you should choose the word 'naive' to try to make your
point! (Grasshoppa). ;) (?)
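[Editorial note on the encoding point both sides keep circling: a minimal
sketch, mine, under the assumption that the mangled characters in the quoted
posts are ISO 8859-1 text decoded as something else. It shows how the same
accented word differs at the byte level between ISO 8859-1 and UTF-8.]
#include <cstdio>
int main() {
    // "naive" with a diaeresis on the i, spelled out byte by byte.
    const unsigned char latin1[] = { 'n', 'a', 0xEF, 'v', 'e' };        // ISO 8859-1: one byte for the accented letter
    const unsigned char utf8[]   = { 'n', 'a', 0xC3, 0xAF, 'v', 'e' };  // UTF-8: two bytes for the same letter
    std::printf("ISO 8859-1:");
    for (unsigned char b : latin1) std::printf(" %02X", b);
    std::printf("\nUTF-8:     ");
    for (unsigned char b : utf8) std::printf(" %02X", b);
    std::printf("\n");
    return 0;
}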
>
>>>> My point was made just above. No need to drag locales into
>>>> the discussion. (My "locale" speaks English as the only
>>>> language (which has only 26 letters, BTW)).
>
>>> And what does the number of letters have to do with it?
>
>> Everything: I program in a spoken language and a programming
>> language. I chose my targets or at least know them: that is
>> the context of the software development.
>
> The context of software development is that each programming
> language defines a set of characters it accepts. Fortran used
> the least, I believe---it was designed so that you could get six
> 6 bit characters in a word. C and C++ require close to a
> million.
My application(s) require(s) no more than 7-bit ASCII. Why continue trying
to sell me a combine when I don't own a farm?
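[Editorial aside, on the assumption that the "close to a million" remark
refers to universal character names: ISO C++ lets a source file name any
Unicode code point with \u or \U escapes even while the file itself stays
7-bit ASCII. A minimal sketch:]
#include <cstdio>
int main() {
    // Universal character names: this source file is plain 7-bit ASCII,
    // yet the wide literals carry code points well outside ASCII.
    const wchar_t word[]   = L"na\u00EFve";  // LATIN SMALL LETTER I WITH DIAERESIS
    const wchar_t symbol[] = L"\u00A7";      // SECTION SIGN, the sort of symbol mentioned in the quoted post
    std::printf("wide characters in the first literal:  %u\n",
                (unsigned)(sizeof(word) / sizeof(word[0]) - 1));
    std::printf("wide characters in the second literal: %u\n",
                (unsigned)(sizeof(symbol) / sizeof(symbol[0]) - 1));
    return 0;
}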
>
>>> French also has only 26 letters.
>
>> That's misleading: French has diacritics, English does not.
>
> Your talk about letters is what is misleading. I'm just
> pointing out that it's irrelevant.
You're not pointing out anything. English has 26 letters in its alphabet.
While English-speaking people MAY recognize foreign words or symbols or
characters, that is irrelevant to this discussion thread because the context
was given that 7-bit ASCII is adequate and all supersets are therefore
irrelevant. (BTW, if you want the statue back, you have to pay for the
maintenance of the elephant since you "gave" it to "us").
>
>>> [...]
>>>> 'naive' has been naturalized into the English language and
>>>> does not have/does not require (unless one feels romantic?)
>>>> an accent. You were taught French, not English.
>
>>> Merriam-Webster disagrees with you.
>
>> Ah! I mentioned Webster long ago in this thread and discounted
>> any relevance:
>
> Merriam-Webster is irrelevant to what is correct American
> English use?
"use" is not under the glass in this thread. I can and do make up words all
day long (OK, I don't do it all day long) ... OK, maybe a few have caught
on.. but "valley girl speak"? Who cares? (Not my problem.. your's with kids
maybe). But valley-girl variable names will probably obey the rules of
English in source code. If I say "there is a fire in your hair" and choose
not to recognize what I said because it doesn't contain your "preferred
diacritic", it's not my problem.
>>> [---]
>>> If you don't know English well, that's your problem.
>
>> You mean if I don't want to accept bastardization/perversion
>> it's my problem.
>
> I mean that if you don't want to accept generally accepted,
> standard usage, it's your problem.
You missed the whole point of the thread. Which is it: YOU are stupid, or
you think everyone else is stupid? It seems it has to be one or the other.
(There of course is another option, but I won't go there).
> A serious one, at that,
> symptomatic of a serious social maladjustment.
Oooo... bring on the Psych 101 when you can't get your way. (Tempting to
play with you, but you will have to cook to find the answers). No offense
grasshoppa.
>
>>> [...]
>>>>> I have to, because my comments where I work now have to be in
>>>>> French, and French without accents is incomprehensible. The
>>>>> need is less frequent in English, but it does occur.
>
>>>> Simplify your life: use English (for SW dev at least)!
>
>>> If you've ever tried to understand English written by a
>>> non-native speaker, you'll realize that it's much simpler to let
>>> them use French (or German, when I worked there).
>
>> Exceptional case.
>
> Native English speakers represent less than 5% of the world's
> population, which means that being a native English speaker is
> the exceptional case.
"Native English" is MUCH different than "English". "the exceptional case" is
exactly that which _I_ define for my application. So your attempt at using
generality as specific argument is quite boring and irrelevant. (I admit
it's a bit fun to wallow as I procrastinate). :P
"Dialect based upon" is just that. I code in C++. Sure I do: with all my
preprocessor macros trying to get anything useful done? Yeah, it's C++!
(Sure it is).
The "debate" (is there one?) is about character encodings and their
applicability. Not the elephant of liberty.
Emoticons are just that. They are different from Acticons, perhaps subtly
and occasionally. ("Get my drift or don't, and I don't care if you do or
don't" emoticon here).
It must have been the "wink smilie" then (OE-QuoteFix (?) is filtering them
out here). That smilie means different things in different contexts. (Does
Merriam Webster define 'smiley' or 'smilie'?... surely they are only
confused (being optimistically unrealistic) capitalists).
> If the sentence had been spoken, the author would
> have been smiling when he said it.
??? Or winking? Now I'm curious about which emoticon you are talking about,
because you totally said SOMEthing incorrect above. (Not that inane
trivialities are important, though they are better than sitcoms).
>
>> If so, what is that meaning and where is it defined? I
>> believe many people use it as a form of negation, kind of an
>> equivalent of a SNL "not".
>
> Not really. It means that the statement is being said in a
> joking manner.
That's incorrect. A "winkie" is context-specific. It could be flirtatious
(or creepy!) in addition to a RANGE of other implications. There is no
definition of the emoticon. (Note: the search for definition is a common
quest, but never taught! Often "capitalised" (to be nice) on).
> In some contexts, this might imply that it is
> false (but not at all in the same way that the "not" does), but
> certainly not in all contexts.
That needs to be at the beginning of the description. Some aliens (surely
not people) will stop reading your text above partway and base things upon
it.
>
>> As far as I am concerned it has just become an annoying noise
>> glyph which may alter the meaning of what is said and may not.
>
> It corresponds to something you'd use tone of voice
Most likely not.
> or
> expression
Vague? You meant more verbiage? (You really should have your wife answer the
posts where you think you can fill in for her).
> to indicate in spoken English.
I have to laugh, not at you. OK, at you.
> It's used on the net
> because the level of English here is often closer to spoken
> English than it is to traditional written English.
I don't feel sorry for you.