Account Options

  1. Sign in
The old Google Groups will be going away soon, but your browser is incompatible with the new version.
Google Groups Home
« Groups Home
utf8 patch
There are currently too many topics in this group that display first. To make this topic appear first, remove this option from another topic.
There was an error processing your request. Please try again.
flag
  19 messages - Collapse all  -  Translate all to Translated (View all originals)
The group you are posting to is a Usenet group. Messages posted to this group will make your email address visible to anyone on the Internet.
Your reply message has not been sent.
Your post was successful
 
From:
To:
Cc:
Followup To:
Add Cc | Add Followup-to | Edit Subject
Subject:
Validation:
For verification purposes please type the characters you see in the picture below or the numbers you hear by clicking the accessibility icon. Listen and type the numbers you hear
 
jira  
View profile  
 More options May 23 2010, 8:55 am
From: jira <j...@prosyn.org>
Date: Sun, 23 May 2010 05:55:43 -0700 (PDT)
Local: Sun, May 23 2010 8:55 am
Subject: utf8 patch
Hello,

lately I wrote about problem with caching utf8 strings. I then said
the problems were outside CHI. Well, actually I was wrong.
Now I found some time to dig into it.
The situation is as follows: if you cache a utf8 string (the utf8 flag
on) then some drivers won't store it at all (the
File driver gives the "wide character in syswrite" error).
Other drivers will store it but will choke on get because they will
try to deserialize it even though it was not serialized. That is due to an
error in unpacking the metadata, where the length given in the metadata is
different for byte vs character semantics.

My suggested solution is to do encode on all input data (keys and
values) and decode it back on output.

Below is my suggested patch. I'm not at all sure it is the best way to
do it, but it seems to work.

Regards,
Jiri

--------------------------------------------------------------------------- ---------------------------------------------------------------------------

diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm
CHI/CacheObject.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm   2010-05-22
23:55:18.000000000 +0200
+++ CHI/CacheObject.pm  2010-05-22 23:41:12.000000000 +0200
@@ -3,6 +3,8 @@
 use strict;
 use warnings;

+use CHI::Util qw(_decode_utf8 _encode_utf8);
+
 use constant f_key              => 0;
 use constant f_raw_value        => 1;
 use constant f_serializer       => 2;
@@ -121,7 +123,7 @@
         if ( $self->[f_is_serialized] ) {
             $value = $self->serializer->deserialize($value);
         }
-        $self->[f_value] = $value;
+        $self->[f_value] = _decode_utf8 $value;
     }
     return $self->[f_value];
 }
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm CHI/
Driver.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm        2010-05-22
23:55:18.000000000 +0200
+++ CHI/Driver.pm       2010-05-23 14:38:49.000000000 +0200
@@ -8,7 +8,7 @@
 use CHI::Driver::Role::IsSubcache;
 use CHI::Driver::Role::Universal;
 use CHI::Serializer::Storable;
-use CHI::Util qw(has_moose_class parse_duration);
+use CHI::Util qw(has_moose_class parse_duration _decode_utf8
_encode_utf8);
 use CHI::Types;
 use Log::Any qw($log);
 use Moose;
@@ -18,6 +18,7 @@
 use strict;
 use warnings;

+
 my $default_serializer = CHI::Serializer::Storable->new();

 has 'chi_root_class'     => ( is => 'ro' );
@@ -143,6 +144,8 @@
     croak "must specify key" unless defined($key);
     my $ns_stats = $self->{ns_stats};

+    $key = _encode_utf8($key);
+
     # Fetch cache object
     #
     my $data = $params{data};
@@ -212,6 +215,7 @@
 sub get_expires_at {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     if ( my $obj = $self->get_object($key) ) {
         return $obj->expires_at;
@@ -224,6 +228,7 @@
 sub exists_and_is_expired {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     if ( my $obj = $self->get_object($key) ) {
         return $obj->is_expired;
@@ -236,6 +241,7 @@
 sub is_valid {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     if ( my $obj = $self->get_object($key) ) {
         return !$obj->is_expired;
@@ -259,6 +265,9 @@
     croak "must specify key" unless defined($key);
     return unless defined($value);

+    $key   = _encode_utf8($key);
+    $value = _encode_utf8($value);
+
     # Fill in $options if not passed, copy if passed, and apply
defaults.
     #
     if ( !defined($options) ) {
@@ -341,6 +350,7 @@
 sub expire {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     my $time = $Test_Time || time();
     if ( defined( my $obj = $self->get_object($key) ) ) {
@@ -356,7 +366,7 @@
     my ( $self, $key, $code ) = @_;
     croak "must specify key and code" unless defined($key) &&
defined($code);

-    if ( my $obj = $self->get_object($key) ) {
+    if ( my $obj = $self->get_object(_encode_utf8 $key) ) {
         my $retval = $code->($obj);
         if ($retval) {
             $self->expire($key);
@@ -383,7 +393,7 @@
 sub fetch_multi_hashref {
     my ( $self, $keys ) = @_;

-    return { map { ( $_, $self->fetch($_) ) } @$keys };
+    return { map { ( $_, $self->fetch(_encode_utf8 $_) ) } @$keys };
 }

 sub get_multi_hashref {
@@ -422,7 +432,7 @@
     croak "must specify keys" unless defined($keys);

     foreach my $key (@$keys) {
-        $self->remove($key);
+        $self->remove(_encode_utf8 $key);
     }
 }

@@ -442,7 +452,7 @@
     my %hash;
     foreach my $key ( $self->get_keys() ) {
         if ( defined( my $value = $self->get($key) ) ) {
-            $hash{$key} = $value;
+            $hash{_decode_utf8 $key} = $value;
         }
     }
     return \%hash;
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/Util.pm CHI/
Util.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/Util.pm  2010-05-22
23:55:18.000000000 +0200
+++ CHI/Util.pm 2010-05-22 23:38:09.000000000 +0200
@@ -5,6 +5,7 @@
 use Fcntl qw( :DEFAULT );
 use File::Spec::Functions qw(catdir catfile);
 use Time::Duration::Parse;
+use Encode;
 use strict;
 use warnings;
 use base qw(Exporter);
@@ -20,6 +21,8 @@
   read_dir
   unique_id
   write_file
+  _decode_utf8
+  _encode_utf8
 );

 my $Fetch_Flags = O_RDONLY | O_BINARY;
@@ -145,6 +148,30 @@
     return ( defined $meta && $meta->isa("Moose::Meta::Class") );
 }

+# stolen from CGI.pm
+sub _decode_utf8 {
+  my $val = shift;
+
+  if (Encode::is_utf8($val)) {
+    return $val;
+  }
+  else {
+    return Encode::decode(utf8 => $val);
+  }
+}
+
+sub _encode_utf8 {
+  my $val = shift;
+
+  if (Encode::is_utf8($val)) {
+    return Encode::encode(utf8 => $val);
+  }
+  else {
+    return $val;
+  }
+}
+
+
 1;

 __END__

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cache-discuss@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-discuss+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Jonathan Swartz  
View profile  
 More options May 23 2010, 9:55 am
From: Jonathan Swartz <swa...@pobox.com>
Date: Sun, 23 May 2010 06:55:51 -0700
Local: Sun, May 23 2010 9:55 am
Subject: Re: utf8 patch
Thanks Jiri. To be honest when you said the problem was outside CHI I  
didn't quite believe it either, because a few other people have  
complained - I just haven't taken the time to figure out what to do  
about it. :)

Can a second person who understands encoding vouch for Jiri's approach  
- not necessarily the exact implementation, but the concept of  
encoding keys and values as they come in and decoding them as they go  
out?

Jon

On May 23, 2010, at 5:55 AM, jira wrote:

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cache-discuss@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-discuss+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.

 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Tim Bunce  
View profile  
 More options May 23 2010, 9:56 am
From: Tim Bunce <Tim.Bu...@pobox.com>
Date: Sun, 23 May 2010 09:56:20 -0400
Local: Sun, May 23 2010 9:56 am
Subject: Re: utf8 patch
I've no idea how appropriate this patch is or what the issues are
(I'm not paying attention) but from a performance perspective
I'd be concerned.

I'd rewrite these more like:

On Sun, May 23, 2010 at 05:55:43AM -0700, jira wrote:
> +sub _decode_utf8 {
> +  my $val = shift;
> +
> +  if (Encode::is_utf8($val)) {
> +    return $val;
> +  }
> +  else {
> +    return Encode::decode(utf8 => $val);
> +  }
> +}

my $utf8_enc = Encode::find_encoding('utf8');

sub _decode_utf8 {
    return shift unless Encode::is_utf8($_[0]);
    return $utf8_enc->decode(shift);

}

and similarly for _encode_utf8. But for better performance I'd
drop the functions and put the logic directly where it's needed.

Tim.

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cache-discuss@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-discuss+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Perrin Harkins  
View profile  
 More options May 23 2010, 1:03 pm
From: Perrin Harkins <per...@elem.com>
Date: Sun, 23 May 2010 13:03:23 -0400
Local: Sun, May 23 2010 1:03 pm
Subject: Re: utf8 patch

On Sun, May 23, 2010 at 8:55 AM, jira <j...@prosyn.org> wrote:
> The situation is as follows: if you cache utf8 string (the utf8 flag
> on) then some drivers won't store it at all(
> File driver gives the "wide character in syswrite error".
> Other drivers will store it but will choke on get because, they will
> try to deseralize it even as it was not serialized.

I'm guessing there are some which already handle it.  It would be good
to allow skipping any CHI encoding on a per-driver basis, so that
drivers which already support unicode don't do extra work.

- Perrin

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cache-discuss@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-discuss+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
jira  
View profile  
 More options May 23 2010, 4:23 pm
From: jira <j...@prosyn.org>
Date: Sun, 23 May 2010 13:23:26 -0700 (PDT)
Local: Sun, May 23 2010 4:23 pm
Subject: Re: utf8 patch

Perrin Harkins wrote:
> On Sun, May 23, 2010 at 8:55 AM, jira <j...@prosyn.org> wrote:
> > The situation is as follows: if you cache utf8 string (the utf8 flag
> > on) then some drivers won't store it at all(
> > File driver gives the "wide character in syswrite error".
> > Other drivers will store it but will choke on get because, they will
> > try to deseralize it even as it was not serialized.

> I'm guessing there are some which already handle it.  It would be good
> to allow skipping any CHI encoding on a per-driver basis, so that
> drivers which already support unicode don't do extra work.

The problem is at the CHI level, not the driver level - in the pack_to_data /
unpack_from_data methods at least.
As it concatenates the utf8 data with the metadata, the whole string
gets upgraded to utf8 and then
one gets bogus values reading the metadata back.
So I think the drivers should not care about encoding at all. They
will just get passed bytes and that's it.
If we encode all the data, then it solves problems for the rest of the
code.

perlunifaq also suggests to encode/decode all data:
http://search.cpan.org/~jesse/perl-5.12.1/pod/perlunifaq.pod

As for the exact implementation there are sure many (and probably
better) ways.

Kind Regards,
Jiri

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cache-discuss@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-discuss+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
jira  
View profile  
 More options May 23 2010, 4:40 pm
From: jira <j...@prosyn.org>
Date: Sun, 23 May 2010 13:40:12 -0700 (PDT)
Local: Sun, May 23 2010 4:40 pm
Subject: Re: utf8 patch

Thanks, this is indeed much faster.
Just to be precise the logic should be inverted:

 sub _decode_utf8 {
     return shift if Encode::is_utf8($_[0]);
     return $utf8_enc->decode(shift);
 }

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cache-discuss@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-discuss+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
jira  
View profile  
 More options May 23 2010, 8:09 pm
From: jira <j...@prosyn.org>
Date: Sun, 23 May 2010 17:09:25 -0700 (PDT)
Local: Sun, May 23 2010 8:09 pm
Subject: Re: utf8 patch

> As for the exact implementation there are sure many (and probably
> better) ways.

My patch was indeed crap. It blindly encoded/decoded all values
even if they were references. So here it is, corrected.
You will probably take a different route, but just to correct myself.

Regards,
Jiri

--------------------------------------------------------------------------- -----------------------------------------------------------------
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm
CHI/CacheObject.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm   2010-05-22
23:55:18.000000000 +0200
+++ CHI/CacheObject.pm  2010-05-24 02:02:00.000000000 +0200
@@ -3,6 +3,8 @@
 use strict;
 use warnings;

+use CHI::Util qw(_decode_utf8 _encode_utf8);
+
 use constant f_key              => 0;
 use constant f_raw_value        => 1;
 use constant f_serializer       => 2;
@@ -52,6 +54,9 @@
         $raw_value     = $serializer->serialize($raw_value);
         $is_serialized = 1;
     }
+    else {
+      $raw_value = _encode_utf8($raw_value);
+    }

     # Not sure where this should be set and checked
     #
@@ -121,7 +126,10 @@
         if ( $self->[f_is_serialized] ) {
             $value = $self->serializer->deserialize($value);
         }
-        $self->[f_value] = $value;
+        else {
+            $value = _decode_utf8($value);
+        }
+        $self->[f_value] =  $value;
     }
     return $self->[f_value];
 }
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm CHI/
Driver.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm        2010-05-22
23:55:18.000000000 +0200
+++ CHI/Driver.pm       2010-05-24 02:00:48.000000000 +0200
@@ -8,7 +8,7 @@
 use CHI::Driver::Role::IsSubcache;
 use CHI::Driver::Role::Universal;
 use CHI::Serializer::Storable;
-use CHI::Util qw(has_moose_class parse_duration);
+use CHI::Util qw(has_moose_class parse_duration _decode_utf8
_encode_utf8);
 use CHI::Types;
 use Log::Any qw($log);
 use Moose;
@@ -18,6 +18,7 @@
 use strict;
 use warnings;

+
 my $default_serializer = CHI::Serializer::Storable->new();

 has 'chi_root_class'     => ( is => 'ro' );
@@ -143,6 +144,8 @@
     croak "must specify key" unless defined($key);
     my $ns_stats = $self->{ns_stats};

+    $key = _encode_utf8($key);
+
     # Fetch cache object
     #
     my $data = $params{data};
@@ -212,6 +215,7 @@
 sub get_expires_at {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     if ( my $obj = $self->get_object($key) ) {
         return $obj->expires_at;
@@ -224,6 +228,7 @@
 sub exists_and_is_expired {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     if ( my $obj = $self->get_object($key) ) {
         return $obj->is_expired;
@@ -236,6 +241,7 @@
 sub is_valid {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     if ( my $obj = $self->get_object($key) ) {
         return !$obj->is_expired;
@@ -259,6 +265,8 @@
     croak "must specify key" unless defined($key);
     return unless defined($value);

+    $key   = _encode_utf8($key);
+
     # Fill in $options if not passed, copy if passed, and apply
defaults.
     #
     if ( !defined($options) ) {
@@ -341,6 +349,7 @@
 sub expire {
     my ( $self, $key ) = @_;
     croak "must specify key" unless defined($key);
+    $key = _encode_utf8($key);

     my $time = $Test_Time || time();
     if ( defined( my $obj = $self->get_object($key) ) ) {
@@ -356,7 +365,7 @@
     my ( $self, $key, $code ) = @_;
     croak "must specify key and code" unless defined($key) &&
defined($code);

-    if ( my $obj = $self->get_object($key) ) {
+    if ( my $obj = $self->get_object(_encode_utf8 $key) ) {
         my $retval = $code->($obj);
         if ($retval) {
             $self->expire($key);
@@ -383,7 +392,7 @@
 sub fetch_multi_hashref {
     my ( $self, $keys ) = @_;

-    return { map { ( $_, $self->fetch($_) ) } @$keys };
+    return { map { ( $_, $self->fetch(_encode_utf8 $_) ) } @$keys };
 }

 sub get_multi_hashref {
@@ -422,7 +431,7 @@
     croak "must specify keys" unless defined($keys);

     foreach my $key (@$keys) {
-        $self->remove($key);
+        $self->remove(_encode_utf8 $key);
     }
 }

@@ -442,7 +451,7 @@
     my %hash;
     foreach my $key ( $self->get_keys() ) {
         if ( defined( my $value = $self->get($key) ) ) {
-            $hash{$key} = $value;
+            $hash{_decode_utf8 $key} = $value;
         }
     }
     return \%hash;
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/Util.pm CHI/
Util.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/Util.pm  2010-05-22
23:55:18.000000000 +0200
+++ CHI/Util.pm 2010-05-22 23:38:09.000000000 +0200
@@ -5,6 +5,7 @@
 use Fcntl qw( :DEFAULT );
 use File::Spec::Functions qw(catdir catfile);
 use Time::Duration::Parse;
+use Encode;
 use strict;
 use warnings;
 use base qw(Exporter);
@@ -20,6 +21,8 @@
   read_dir
   unique_id
   write_file
+  _decode_utf8
+  _encode_utf8
 );

 my $Fetch_Flags = O_RDONLY | O_BINARY;
@@ -145,6 +148,30 @@
     return ( defined $meta && $meta->isa("Moose::Meta::Class") );
 }

+# stolen from CGI.pm
+sub _decode_utf8 {
+  my $val = shift;
+
+  if (Encode::is_utf8($val)) {
+    return $val;
+  }
+  else {
+    return Encode::decode(utf8 => $val);
+  }
+}
+
+sub _encode_utf8 {
+  my $val = shift;
+
+  if (Encode::is_utf8($val)) {
+    return Encode::encode(utf8 => $val);
+  }
+  else {
+    return $val;
+  }
+}
+
+
 1;

 __END__

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cache-discuss@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-discuss+unsubscribe@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Aristotle Pagaltzis  
View profile  
 More options May 29 2010, 8:42 pm
From: Aristotle Pagaltzis <pagalt...@gmx.de>
Date: Sun, 30 May 2010 02:42:57 +0200
Local: Sat, May 29 2010 8:42 pm
Subject: Re: utf8 patch
* Jonathan Swartz <swa...@pobox.com> [2010-05-23 15:55]:

> Can a second person who understands encoding vouch for Jiri's
> approach - not necessarily the exact implementation, but the
> concept of encoding keys and values as they come in and
> decoding them as they go out?

His patch is not sane. The utility functions look at the UTF8
flag to decide what to do, which is a broken approach, by
definition. (The UTF8 flag signifies the internal format of the
byte buffer of the string, but it says nothing about whether
a string consists of characters or bytes.) It will hide some
problems but cause others.

I do not have enough knowledge of CHI to give good advice on how
to proceed, however.

How many 3rd party drivers exist? How important is it that a new
version of CHI which fixes this problem be backward-compatible
with old drivers? (It will probably require changes to the driver
interface to fix it correctly.)

Regards,
--
Aristotle Pagaltzis // <http://plasmasturm.org/>


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Jonathan Swartz  
View profile  
 More options May 30 2010, 8:58 am
From: Jonathan Swartz <swa...@pobox.com>
Date: Sun, 30 May 2010 05:58:14 -0700
Local: Sun, May 30 2010 8:58 am
Subject: Re: utf8 patch

On May 29, 2010, at 5:42 PM, Aristotle Pagaltzis wrote:

> * Jonathan Swartz <swa...@pobox.com> [2010-05-23 15:55]:
>> Can a second person who understands encoding vouch for Jiri's
>> approach - not necessarily the exact implementation, but the
>> concept of encoding keys and values as they come in and
>> decoding them as they go out?

> His patch is not sane. The utility functions look at the UTF8
> flag to decide what to do, which is a broken approach, by
> definition. (The UTF8 flag signifies the internal format of the
> byte buffer of the string, but it says nothing about whether
> a string consists of characters or bytes.) It will hide some
> problems but cause others.

I'm not so sure. I know that using is_utf8 is generally Wrong. But  
CHI's role is not to interpret the data in any way, merely to store  
and retrieve it and make sure it doesn't change in that process. In  
that case, isn't it the right thing to make sure the utf8 flag is set  
exactly the same after a store and retrieve?

For example, this script

    #!/usr/bin/perl -w
    use Encode;
    use Storable qw(freeze thaw);
    use strict;

    my ($in, $out);

    $in = "\x{263a}b";
    $out = thaw(freeze([$in]))->[0];
    print "is_utf8 before Storable: " . (Encode::is_utf8($in) ? 't' :  
'f') . "\n";
    print "is_utf8 after Storable: " . (Encode::is_utf8($out) ? 't' :  
'f') . "\n\n";

    $in = join('', map { chr($_) } (226, 152, 186, 98));
    $out = thaw(freeze([$in]))->[0];
    print "is_utf8 before Storable: " . (Encode::is_utf8($in) ? 't' :  
'f') . "\n";
    print "is_utf8 after Storable: " . (Encode::is_utf8($out) ? 't' :  
'f') . "\n\n";

prints

    is_utf8 before Storable: t
    is_utf8 after Storable: t

    is_utf8 before Storable: f
    is_utf8 after Storable: f

so Storable is preserving the is_utf8 flag. JSON does the same thing.  
We use Storable to serialize reference values, but we store scalar  
values raw, so we need to take responsibility for the utf8 flag in the  
scalar case.

Aristotle, please help me understand your objections better.

Thanks
Jon


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
jira  
View profile  
 More options May 30 2010, 9:44 am
From: jira <j...@prosyn.org>
Date: Sun, 30 May 2010 06:44:29 -0700 (PDT)
Local: Sun, May 30 2010 9:44 am
Subject: Re: utf8 patch

Hello,

admittedly, I'm not super knowledgeable about the perl Unicode and
probably should not have attempted the patch at all.
But I did some amount of reading and looking at what other code does
and I think the general approach I suggested is correct.

perlunicode says:
"When Perl exchanges data with an extension, the extension should be
able to understand the UTF-8 flag and act accordingly."

And basically it suggests the strategy I used. And that is also what I
do in my code to circumvent the problem
(encode/decode the strings).

Also, from what I've read it seems to me that the utf8 flag actually
does signify character semantics. But I might be just confused.

Kind Regards,
Jiri


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Roland Lammel  
View profile  
 More options May 30 2010, 10:51 am
From: Roland Lammel <r...@quikit.at>
Date: Sun, 30 May 2010 16:51:07 +0200
Local: Sun, May 30 2010 10:51 am
Subject: Re: utf8 patch

Hi all,

Being no expert on Perl Unicode, I have had to cope with UTF-8 data for quite some
time now. The most important rule from my experience (and I heard some echoes
of that on some lists) is that you should make sure to decode when
reading/receiving data from external resources (network, file, database) and
make sure to encode when sending/writing data to external resources
(external here means not under your direct control anymore). The only
exception is if your code should act completely transparently or
performance requires some tricks. Of course defining what external
means and where external starts is the hard part.

So for CHI this would mean we need to consider keys and values here.
External would be the border where the writing to the backend occurs. As CHI
is a pure layer for adding cache functionality and it knows nothing about
the actual semantics of the keys or values, it needs to be given a hint as to what the
data actually is (meaning binary or encoded string). A way to deal with that
would be to allow passing params to CHI that specify whether keys and/or
values should be considered strings or binaries. Considering backwards
compatibility, that would mean binary by default, so the behaviour of CHI
would not change, and introducing an encoding param (either just string with
automagic UTF-8 handling similar to the patch proposed, or specifying the
encoding to use).

This could allow dwimity, proper handling of encoding and backwards
compatibility by still allowing pure binary data as key/value. The downside
would be additional options when initializing CHI and adding a little
complexity to the core/drivers.

Thoughts?

Cheers

+rl

--
Roland Lammel
QuikIT - IT Lösungen - flexibel und schnell
Web: http://www.quikit.at
Email: i...@quikit.at

"Enjoy your job, make lots of money, work within the law. Choose any two."


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Larry Leszczynski  
View profile  
 More options May 30 2010, 2:17 pm
From: "Larry Leszczynski" <lar...@emailplus.org>
Date: Sun, 30 May 2010 12:17:12 -0600
Local: Sun, May 30 2010 2:17 pm
Subject: Re: utf8 patch

On Sun, 30 May 2010 05:58 -0700, "Jonathan Swartz" <swa...@pobox.com>
wrote:

> CHI's role is not to interpret the data in any way, merely to store  
> and retrieve it and make sure it doesn't change in that process. In  
> that case, isn't it the right thing to make sure the utf8 flag is set  
> exactly the same after a store and retrieve?

That would be my expectation - my application-level code should work the
same way with caching turned on or turned off.  In the typical use case
you're going to call some API to get some data, and under the covers
that data might be coming from the cache, or might be coming e.g.
directly from a database (and also stored in the cache as a side
effect).  So at the application level I would handle the data the same
way regardless, and expect CHI to be transparent.

Larry


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Jonathan Swartz  
View profile  
 More options May 30 2010, 2:45 pm
From: Jonathan Swartz <swa...@pobox.com>
Date: Sun, 30 May 2010 11:45:10 -0700
Local: Sun, May 30 2010 2:45 pm
Subject: Re: utf8 patch

Thanks for the insights, Roland.

When people store values in CHI, they expect to get the same thing out  
when they retrieve it; so, I believe preserving the value of the utf-8  
flag is the right thing to do.

As far as keys and namespaces, they have to be encoded if their utf-8  
flag is on; otherwise we will not be able to use them in directory  
names, filenames, logs, etc.

Given this, I don't see a need for a separate param (and I think this  
would cause CHI to be "thinking too much" about unicode), but feel  
free to convince me otherwise. In this case a counter-example would be  
helpful.

Jon

On May 30, 2010, at 7:51 AM, Roland Lammel wrote:


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Roland Lammel  
View profile  
 More options May 30 2010, 3:14 pm
From: Roland Lammel <r...@quikit.at>
Date: Sun, 30 May 2010 21:14:55 +0200
Local: Sun, May 30 2010 3:14 pm
Subject: Re: utf8 patch

Hi Jon,

I completely agree that CHI should not think too much (both in terms of
performance and simplicity). What I probably missed to point out in the
earlier post is, that we should just ensure, that when a key is presented to
CHI it should be stored in a way, that when another cache instance (meaning
another application or from another host) is hitting the cache. With the
policy to act on the utf8 flag set, this should do it, as data would still
be treated binary actually with the exception of perl having detected it to
be a utf8 string. So it would be a proper encoded key anyway.

Sounds good and simple, cheers

+rl

--
Roland Lammel
QuikIT - IT Lösungen - flexibel und schnell
Web: http://www.quikit.at
Email: i...@quikit.at

"Enjoy your job, make lots of money, work within the law. Choose any two."


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Aristotle Pagaltzis  
View profile  
 More options Jun 2 2010, 5:40 am
From: Aristotle Pagaltzis <pagalt...@gmx.de>
Date: Wed, 2 Jun 2010 11:40:45 +0200
Local: Wed, Jun 2 2010 5:40 am
Subject: Re: utf8 patch
* Jonathan Swartz <swa...@pobox.com> [2010-05-30 15:00]:

> I'm not so sure. I know that using is_utf8 is generally Wrong.
> But CHI's role is not to interpret the data in any way, merely
> to store and retrieve it and make sure it doesn't change in
> that process. In that case, isn't it the right thing to make
> sure the utf8 flag is set exactly the same after a store and
> retrieve?

Is it? If I store some data under the key `"naïve"`, once with
UTF8 flag turned off and once with the flag on, am I storing the
data under two different keys, or the same key? And if it *is*
considered the same key, and I ask for what that key is, should
I get it with UTF8 flag on (that was how it was first stored) or
off (as it was off in the latest write)?

Note that because Perl considers the flag a transparent internal
implementation detail, it can easily happen that code used the
*same scalar variable* in both cases (flag off, flag on), just
because it also used the variable in some other operation in the
meantime that implicitly upgraded the string.

That’s fine in Perl – a string with UTF8 flag off and a string
with UTF8 flag on mean the same thing if they contain the same
sequence of logical characters even when their internal
representation differs.

But it means that you’d in turn be forcing client code to look at
the UTF8 flag to make sure it’s really passing what it thinks
it’s passing. And IMO any API which forces its clients to care
about the UTF8 flag is broken.

> For example, this script […] prints […] so Storable is
> preserving the is_utf8 flag.

Storable never interprets the data you pass it in any way
whatsoever. CHI does.

> JSON does the same thing.

Colour me dubious. There is no way to express the concept of
a UTF8 flag in JSON.

> We use Storable to serialize reference values, but we store
> scalar values raw, so we need to take responsibility for the
> utf8 flag in the scalar case.

I think the only sane thing to do is to consider all strings to
be strings – and in Perl there is no difference between character
strings and byte strings. You can slurp a JPEG image file into
a scalar, upgrade the scalar (turning on the UTF8 flag), and then
write the scalar back out to another file, and you’ll get the
very same JPEG image back, even though the UTF8 flag was turned
on in internal storage.

The UTF8 flag is misnamed. What it actually means is whether the
internal storage format of the string is a fixed-width packed
bytes array or a variable-width integer sequence. That’s all.

The REAL problem you have is that some of your cache backends can
cope with keys containing characters > 255 and some cannot.

I think it’s a bug in those backends when they cannot cope.

But you could decide to solve the problem centrally in CHI by
defining some kind of universal characters→bytes transliteration
scheme. As it happens, the UTF-8 encoding is a good choice for
such an encoding. In other words, you would encode ALL keys, no
matter whether the UTF8 flag is turned on or off, because the
flag does not change the semantic meaning of the string, ie. the
key `"naïve"` should yield the same encoded result regardless of
whether it was stored in a scalar with the UTF8 flag on or off.

This does however mean that backends which would be capable of
storing characters > 255 will only store the transliterated
versions, just like everyone else, so eg. if you use a DBI or DBM
backend then the data in the store will be harder to examine
because it will be stored in encoded form.

Regards,
--
Aristotle Pagaltzis // <http://plasmasturm.org/>


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Jonathan Swartz  
View profile  
 More options Jun 3 2010, 1:30 pm
From: Jonathan Swartz <swa...@pobox.com>
Date: Thu, 3 Jun 2010 10:30:35 -0700
Local: Thurs, Jun 3 2010 1:30 pm
Subject: Re: utf8 patch
 > Is it? If I store some data under the key `"naïve"`, once with
 > UTF8 flag turned off and once with the flag on, am I storing the
 > data under two different keys, or the same key? And if it *is*
 > considered the same key, and I ask for what that key is, should
 > I get it with UTF8 flag on (that was how it was first stored) or
 > off (as it was off in the latest write)?

Thanks for your message Aristotle. I still have a tenuous grasp of  
these issues, so I appreciate your advice and anything further you can  
provide!

Here's how the code looks now for keys and values.

KEYS

* Any key passed to a CHI operation (get, set, remove, etc.) is utf-8  
encoded iff its utf-8 flag is on.
* The encoding is a one-way operation. We don't record that the key  
was encoded, and get_keys does not attempt to decode it. (There is no  
real support in CHI for storing meta-data about keys.)

The rationale here is that
* I want to use utf-8 strings for keys even in drivers that can't  
handle wide characters
* I want to be able to pass the results of get_keys() back into get()  
and have it still retrieve the same object, without double-encoding it  
(though I realize this will break if someone calls get_keys(), then  
somehow turns the utf-8 flag back on before passing it back into get())
* I want to be backwards compatible with existing caches with binary  
string keys - thus I cannot encode all keys blindly

http://github.com/jonswar/perl-chi/blob/master/lib/CHI/Driver.pm#L500...

VALUES

* Any scalar value passed to set is encoded iff the utf-8 flag is on.
* The encoding is a two-way operation. We record the fact that the  
value was encoded, and we decode it when retrieving it from the cache.

The rationale here is that
* I want to be able to store utf-8 strings as values even in drivers  
that can't handle wide characters
* I want the values to come out exactly the same way as when they were  
stored
* I want to be backwards compatible with existing caches with binary  
string values - thus I cannot decode all values blindly

http://github.com/jonswar/perl-chi/blob/master/lib/CHI/CacheObject.pm...
http://github.com/jonswar/perl-chi/blob/master/lib/CHI/CacheObject.pm...

Here's a test class that attempts to confirm some of this:

     http://github.com/jonswar/perl-chi/blob/master/lib/CHI/t/Encode.pm

So. I'm consulting the utf-8 flag in both cases, even though I  
understand from all the docs that it is "wrong" to depend on this  
flag. But I can't figure out a better way to get the behavior and the  
backward compatibility that I want without consulting the flag.

Feedback welcome.

Jon


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Aristotle Pagaltzis  
View profile  
 More options Jun 4 2010, 12:10 pm
From: Aristotle Pagaltzis <pagalt...@gmx.de>
Date: Fri, 4 Jun 2010 18:10:31 +0200
Local: Fri, Jun 4 2010 12:10 pm
Subject: Re: utf8 patch
Hi Jonathan,

* Jonathan Swartz <swa...@pobox.com> [2010-06-03 19:30]:

that’s a problem.

Keys with only characters < 128 will always yield the same value
because their representation is the same regardless of the UTF8
flag, and keys with characters > 255 will also always yield the
same value because they can only be stored in strings with the
UTF8 flag on.

But for keys with characters in the 128..255 range, there are two
possible internal representations. So a string which contains
such characters will correspond to two different keys, depending
on its UTF8 flag. Different code paths that should access the
same key might therefore end up accessing different keys. This is,
to put it poetically, schizophrenic.

The right thing to do is to either always encode strings for use
as keys (= backends do not have to handle wide characters), or
never encode them (= backends have to decide for themselves how
to handle characters > 255) – rather than encoding them sometimes
and not encoding them other times.

Note that whichever of these changes you make, the only data that
will be affected by this change is data which CHI already
handles in a schizophrenic fashion.

There is no sane solution to centralise the handling of big
characters in CHI if you are aiming for zero compatibility
breakage.

I would seriously opt for the null strategy: simply document
that backends are required to handle big characters in whichever
way they deem best for themselves, unless they tell CHI that it
should encode keys for them, in which case CHI would *always*
encode keys. This way, old backends that were broken WRT big
characters continue to be broken in exactly the same way as they
used to be, i.e. compatibility is automatic. New backends (or new
backend releases) would take this into account.

This is sane.

You get different results depending on whether the UTF8 flag is
on or off, but you also process them differently, so that it
cancels out on the bottom line.

> Here's a test class that attempts to confirm some of this:

>     http://github.com/jonswar/perl-chi/blob/master/lib/CHI/t/Encode.pm

> So. I'm consulting the utf-8 flag in both cases, even though
> I understand from all the docs that it is "wrong" to depend on
> this flag. But I can't figure out a better way to get the
> behavior and the backward compatibility that I want without
> consulting the flag.

There is no way to get both. Consulting the flag merely trades
one set of broken behaviours for another.

Regards,
--
Aristotle Pagaltzis // <http://plasmasturm.org/>


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Jonathan Swartz  
View profile  
 More options Jun 5 2010, 10:50 am
From: Jonathan Swartz <swa...@pobox.com>
Date: Sat, 5 Jun 2010 07:50:00 -0700
Subject: Re: utf8 patch

> that’s a problem.

> Keys with only characters < 128 will always yield the same value
> because their representation is the same regardless of the UTF8
> flag, and keys with characters > 255 will also always yield the
> same value because they can only be stored in strings with the
> UTF8 flag on.

> But for keys with characters in the 128..255 range, there are two
> possible internal representations. So a string which contains
> such characters will correspond to two different keys, depending
> on its UTF8 flag. Different code paths that should access the
> same key might therefore end up accessing different keys. This is,
> to put it poetically, schizophrenic.

Yes, I see the problem.

Ok, one more try: What if I only encoded strings that contained wide  
characters? e.g.

    if (is_utf8($key) && $key =~ /[^\x00-\xFF]/) {
        encode(utf8 => $key);
    }

Then there is no way for a key with characters in the 128..255 range to be  
stored as two different keys.

I know that it seems simpler and more correct to encode all keys. But  
if I do that, I have to decode them all on the way back in (otherwise  
I'll get double-encoding when people pass the results of get_keys() or  
get_object()->key() back into CHI), which is undesirable (I have to  
capture and filter all calls that return keys).

Letting the backends take care of this themselves will either amount  
to the same thing, or result in inconsistent behavior.

> Note that whichever of these changes you make, the only data that
> will be affected by this change is data which CHI already
> handles in a schizophrenic fashion.

I don't see that - right now, keys with chars in the 128..255 range  
are always handled as binary chars.

Thanks
Jon


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
Jonathan Swartz  
View profile  
 More options Jun 7 2010, 1:11 am
From: Jonathan Swartz <swa...@pobox.com>
Date: Sun, 6 Jun 2010 22:11:50 -0700
Local: Mon, Jun 7 2010 1:11 am
Subject: Re: utf8 patch
Another note, both Cache::FastMmap and DBD::SQLite seem to have this  
"schizophrenia", as you put it.

    #!/usr/bin/perl -w
    use Cache::FastMmap;
    use Carp::Assert;
    use DBI;
    use DBD::SQLite;
    use strict;

    my $binary_off = chr(129);
    my $binary_on  = substr($binary_off . "\x{263a}", 0,  
length($binary_off));
    assert($binary_off eq $binary_on);

    print "** sqlite **\n";
    unlink("sqlite.dat");
    my $dbh = DBI->connect("dbi:SQLite:dbname=sqlite.dat","","");
    $dbh->do("create table foo (key text)");
    my $sth = $dbh->do("insert into foo values (?)", {}, $binary_off);
    print "binary_off: " . $dbh->selectcol_arrayref("select count(*)  
from foo where key = ?", {}, $binary_off)->[0] . "\n";
    print "binary_on: " . $dbh->selectcol_arrayref("select count(*)  
from foo where key = ?", {}, $binary_on)->[0] . "\n";

    print "** fastmmap **\n";
    my $cache = Cache::FastMmap->new();
    $cache->set($binary_off, 5);
    print "binary_off: " . defined($cache->get($binary_off)) . "\n";
    print "binary_on: " . defined($cache->get($binary_on)) . "\n";

This prints

   ** sqlite **
    binary_off: 1
    binary_on: 0
    ** fastmmap **
    binary_off: 1
    binary_on:

Meaning that, even though $binary_off eq $binary_on, both sqlite and  
fastmmap treat them as distinct.

Jon

On Jun 5, 2010, at 7:50 AM, Jonathan Swartz wrote:


 
You must Sign in before you can post messages.
To post a message you must first join this group.
Please update your nickname on the subscription settings page before posting.
You do not have the permission required to post.
End of messages
« Back to Discussions « Newer topic     Older topic »