Account Options

  1. Sign in
The old Google Groups will be going away soon, but your browser is incompatible with the new version.
Google Groups Home
« Groups Home
Message from discussion String trim (was JavaScript Functions)
The group you are posting to is a Usenet group. Messages posted to this group will make your email address visible to anyone on the Internet.
Your reply message has not been sent.
Your post was successful
 
From:
To:
Cc:
Followup To:
Add Cc | Add Followup-to | Edit Subject
Subject:
Validation:
For verification purposes please type the characters you see in the picture below or the numbers you hear by clicking the accessibility icon. Listen and type the numbers you hear
 
Richard Cornford  
View profile  
 More options Feb 12 2009, 6:59 pm
Newsgroups: comp.lang.javascript
From: "Richard Cornford" <Rich...@litotes.demon.co.uk>
Date: Thu, 12 Feb 2009 23:59:49 -0000
Local: Thurs, Feb 12 2009 6:59 pm
Subject: Re: String trim (was JavaScript Functions)

kangax wrote:
> Garrett Smith wrote:
> [...]
>> Is this fixed in recent JScript in IE8?

> I'll check this later, will let you know.

If that question was whether \s in a regular expression matches '\u00A0'
in IE 8 then it appears that the answer is no.

>> Question: Why not:-

>> function trimString(s){
>>   return s.replace(/^[\s|\xA0]+|[\s|\xA0]+$/g, '');
>> }

> That would work, sure.

<snip>

By "work" I assume you mean that it corrects the fact that some
javascript engines do not match '\u00A0' (the non-breaking space
character) when \s is used in a regular expression. But is this an
attempt to create methods that conform to the ECMA (3rd Ed.)
specification, and thus are consistent across platforms, or is it a
desire to arbitrarily include the non-breaking space in the set of
matched characters for a trim function?

The former seems the more reasonable goal, but in that case this trim
function still falls short as where '\u00A0' is not matched the odds are
extremely good that '\u2003' (EM space), to name but one, is not matched
either. So the above function will still produce inconsistent results
between javascript engines.

The ECMA spec wants \s to match javascript's white space characters and
its line terminator characters, but the definition of whitespace includes
all of Unicode's Zs group, and JScript, for example, does not match the
majority of those.

Try this out:-

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
   "http://www.w3.org/TR/html4/loose.dtd">
<html>
    <head>
        <title></title>
    </head>
    <body>
<pre>
<script type="text/javascript">

/* Characters listed as white space for the \s test.  Each record holds:
     cp        - decimal code point, as a string
     codePoint - the same code point in 0xHHHH form
     character - the character itself, written as a Unicode escape
     name      - character name used in the report
     group     - Unicode general category abbreviation
*/
var WhiteSpace = [
    { cp: '9',     codePoint: '0x0009', character: '\u0009',
      name: '<control>[ASCII Tab]', group: 'Cc' },
    { cp: '11',    codePoint: '0x000B', character: '\u000B',
      name: '<control>[ASCII Vertical Tab]', group: 'Cc' },
    { cp: '12',    codePoint: '0x000C', character: '\u000C',
      name: '<control>[ASCII Form Feed]', group: 'Cc' },
    { cp: '32',    codePoint: '0x0020', character: '\u0020',
      name: 'SPACE', group: 'Zs' },
    { cp: '160',   codePoint: '0x00A0', character: '\u00A0',
      name: 'NO-BREAK SPACE', group: 'Zs' },
    { cp: '5760',  codePoint: '0x1680', character: '\u1680',
      name: 'OGHAM SPACE MARK', group: 'Zs' },
    { cp: '6158',  codePoint: '0x180E', character: '\u180E',
      name: 'MONGOLIAN VOWEL SEPARATOR', group: 'Zs' },
    { cp: '8192',  codePoint: '0x2000', character: '\u2000',
      name: 'EN QUAD', group: 'Zs' },
    { cp: '8193',  codePoint: '0x2001', character: '\u2001',
      name: 'EM QUAD', group: 'Zs' },
    { cp: '8194',  codePoint: '0x2002', character: '\u2002',
      name: 'EN SPACE', group: 'Zs' },
    { cp: '8195',  codePoint: '0x2003', character: '\u2003',
      name: 'EM SPACE', group: 'Zs' },
    { cp: '8196',  codePoint: '0x2004', character: '\u2004',
      name: 'THREE-PER-EM SPACE', group: 'Zs' },
    { cp: '8197',  codePoint: '0x2005', character: '\u2005',
      name: 'FOUR-PER-EM SPACE', group: 'Zs' },
    { cp: '8198',  codePoint: '0x2006', character: '\u2006',
      name: 'SIX-PER-EM SPACE', group: 'Zs' },
    { cp: '8199',  codePoint: '0x2007', character: '\u2007',
      name: 'FIGURE SPACE', group: 'Zs' },
    { cp: '8200',  codePoint: '0x2008', character: '\u2008',
      name: 'PUNCTUATION SPACE', group: 'Zs' },
    { cp: '8201',  codePoint: '0x2009', character: '\u2009',
      name: 'THIN SPACE', group: 'Zs' },
    { cp: '8202',  codePoint: '0x200A', character: '\u200A',
      name: 'HAIR SPACE', group: 'Zs' },
    { cp: '8239',  codePoint: '0x202F', character: '\u202F',
      name: 'NARROW NO-BREAK SPACE', group: 'Zs' },
    { cp: '8287',  codePoint: '0x205F', character: '\u205F',
      name: 'MEDIUM MATHEMATICAL SPACE', group: 'Zs' },
    { cp: '12288', codePoint: '0x3000', character: '\u3000',
      name: 'IDEOGRAPHIC SPACE', group: 'Zs' }
];

/* Characters listed as line terminators for the \s test.  Records use
   the fields cp (decimal code point string), codePoint (0xHHHH string),
   character, name and group (Unicode general category abbreviation).
*/
var LineTerminator = [
    { cp: '8232', codePoint: '0x2028', character: '\u2028',
      name: 'LINE SEPARATOR', group: 'Zl' },
    { cp: '8233', codePoint: '0x2029', character: '\u2029',
      name: 'PARAGRAPH SEPARATOR', group: 'Zp' },
    { cp: '10',   codePoint: '0x000A', character: '\u000A',
      name: '<control>[ASCII Line Feed]', group: 'Cc' },
    { cp: '13',   codePoint: '0x000D', character: '\u000D',
      name: '<control>[ASCII Carriage Return]', group: 'Cc' }
];

/* Format characters reported under the "Must not be matched by \s"
   heading.  Records use the fields cp (decimal code point string),
   codePoint (0xHHHH string), character, name and group (Unicode general
   category abbreviation).
*/
var NonWhiteSpace = [
    { cp: '8203',  codePoint: '0x200B', character: '\u200B',
      name: 'ZERO WIDTH SPACE (will be whitespace in ES 3.1)',
      group: 'Cf' },
    { cp: '8204',  codePoint: '0x200C', character: '\u200C',
      name: 'ZERO WIDTH NON-JOINER', group: 'Cf' },
    { cp: '8205',  codePoint: '0x200D', character: '\u200D',
      name: 'ZERO WIDTH JOINER', group: 'Cf' },
    { cp: '65279', codePoint: '0xFEFF', character: '\uFEFF',
      name: 'ZERO WIDTH NO-BREAK SPACE (will be whitespace in ES 3.1)',
      group: 'Cf' }
];

/* The character tables whose members are reported under the
   "Should be matched by \s" heading.
*/
var testAr = [WhiteSpace, LineTerminator];

/* The pattern under test: the engine's own idea of \s. */
var testRX = /\s/g;

/* Reports whether this engine's \s matches a single character.

   chObj is a record with the properties:
     cp        - decimal code point (string or number; coerced with +)
     codePoint - display form of the code point (e.g. "0x00A0")
     character - the character, as produced by a Unicode escape literal
     name      - display name
     group     - Unicode general category abbreviation

   Returns one report line (terminated by "\n"):
     - normally: "true" or "false" (whether \s matched), a tab, the
       character left after stripping in double quotes, then the
       code point, name and group;
     - an "Anomaly ..." line if String.fromCharCode returned an empty
       string, or a string different from chObj.character.
*/
function testWS(chObj){
    /* The local is called "ch" because "char" is a future reserved word
       in ECMA-262 3rd Ed., the specification this script targets.
    */
    var ch = String.fromCharCode(+chObj.cp),
        stripped;

    if(!ch){
        return 'Anomaly: None empty string is false.\n';
    }

    /* First verify that the String.fromCharCode result matches the
       Unicode escape sequence based string literal for the character.
    */
    if(ch !== chObj.character){
        return 'Anomaly in String.fromCharCode('+chObj.cp+')\n';
    }

    /* If \s matched, the replacement removed the character and the
       result no longer equals the original.  String#replace with a
       global pattern starts from the beginning of the string, so
       sharing one /g regular expression between calls is safe here.
    */
    stripped = ch.replace(testRX, '');

    return (
        (stripped !== chObj.character)+
        '\t"'+stripped+'"\tcp = '+chObj.codePoint+'\tname = '+
        chObj.name+'  group = '+chObj.group+'\n'
    );
}

/* Report driver: writes one line per character into the document.

   testWS returns a line that starts with "true" or "false" (or with
   "Anomaly" when its sanity checks fail).  Only that leading flag is
   relabelled, so the patterns are anchored with ^; an unanchored
   string replace would also rewrite the word "false" inside an
   anomaly message.
*/
var  c, cp, list, ctr;

/* Characters that \s is expected to match: a match is GOOD. */
document.write('Should be matched by \\s\n');
for(c = 0;c < testAr.length;++c){
    list = testAr[c];
    for(cp = 0;cp < list.length;++cp){
        ctr = list[cp];
        document.write(testWS(ctr)
           .replace(/^true/, 'GOOD')
           .replace(/^false/, 'FAIL'));

    }

}

/* Characters that \s is expected not to match: a match is a FAIL. */
document.write('\n\nMust not be matched by \\s (in ES 3)\n');
for(cp = 0;cp < NonWhiteSpace.length;++cp){
    ctr = NonWhiteSpace[cp];
    document.write(testWS(ctr)
      .replace(/^true/, 'FAIL')
      .replace(/^false/, 'GOOD'));

}

</script>
</pre>
    </body>
</html>

Richard.


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.