utf8 patch

jira

unread,

May 23, 2010, 8:55:43 AM5/23/10

to Perl-Cache Discuss

Hello,

lately I wrote about problem with caching utf8 strings. I then said
the problems were outside CHI. Well, actually I was wrong.
Now I found some time to dig into it.
The situation is as follows: if you cache utf8 string (the utf8 flag
on) then some drivers won't store it at all (the
File driver gives the "wide character in syswrite" error).
Other drivers will store it but will choke on get because they will
try to deserialize it even though it was not serialized. That is due to the
error in unpack metadata, where the given length on metadata is
different for byte vs character semantics.

My suggested solution is to do encode on all input data (keys and
values) and decode it back on output.

Below is my suggested patch. I'm not at all sure it is the best way to
do it, but it seems to work.

Regards,
Jiri

------------------------------------------------------------------------------------------------------------------------------------------------------

diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm
CHI/CacheObject.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm 2010-05-22
23:55:18.000000000 +0200
+++ CHI/CacheObject.pm 2010-05-22 23:41:12.000000000 +0200
@@ -3,6 +3,8 @@
use strict;
use warnings;

+use CHI::Util qw(_decode_utf8 _encode_utf8);
+
use constant f_key => 0;
use constant f_raw_value => 1;
use constant f_serializer => 2;
@@ -121,7 +123,7 @@
if ( $self->[f_is_serialized] ) {
$value = $self->serializer->deserialize($value);
}
- $self->[f_value] = $value;
+ $self->[f_value] = _decode_utf8 $value;
}
return $self->[f_value];
}
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm CHI/
Driver.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm 2010-05-22
23:55:18.000000000 +0200
+++ CHI/Driver.pm 2010-05-23 14:38:49.000000000 +0200
@@ -8,7 +8,7 @@
use CHI::Driver::Role::IsSubcache;
use CHI::Driver::Role::Universal;
use CHI::Serializer::Storable;
-use CHI::Util qw(has_moose_class parse_duration);
+use CHI::Util qw(has_moose_class parse_duration _decode_utf8
_encode_utf8);
use CHI::Types;
use Log::Any qw($log);
use Moose;
@@ -18,6 +18,7 @@
use strict;
use warnings;

+
my $default_serializer = CHI::Serializer::Storable->new();

has 'chi_root_class' => ( is => 'ro' );
@@ -143,6 +144,8 @@
croak "must specify key" unless defined($key);
my $ns_stats = $self->{ns_stats};

+ $key = _encode_utf8($key);
+
# Fetch cache object
#
my $data = $params{data};
@@ -212,6 +215,7 @@
sub get_expires_at {
my ( $self, $key ) = @_;
croak "must specify key" unless defined($key);
+ $key = _encode_utf8($key);

if ( my $obj = $self->get_object($key) ) {
return $obj->expires_at;
@@ -224,6 +228,7 @@
sub exists_and_is_expired {
my ( $self, $key ) = @_;
croak "must specify key" unless defined($key);
+ $key = _encode_utf8($key);

if ( my $obj = $self->get_object($key) ) {
return $obj->is_expired;
@@ -236,6 +241,7 @@
sub is_valid {
my ( $self, $key ) = @_;
croak "must specify key" unless defined($key);
+ $key = _encode_utf8($key);

if ( my $obj = $self->get_object($key) ) {
return !$obj->is_expired;
@@ -259,6 +265,9 @@
croak "must specify key" unless defined($key);
return unless defined($value);

+ $key = _encode_utf8($key);
+ $value = _encode_utf8($value);
+
# Fill in $options if not passed, copy if passed, and apply
defaults.
#
if ( !defined($options) ) {
@@ -341,6 +350,7 @@
sub expire {
my ( $self, $key ) = @_;
croak "must specify key" unless defined($key);
+ $key = _encode_utf8($key);

my $time = $Test_Time || time();
if ( defined( my $obj = $self->get_object($key) ) ) {
@@ -356,7 +366,7 @@
my ( $self, $key, $code ) = @_;
croak "must specify key and code" unless defined($key) &&
defined($code);

- if ( my $obj = $self->get_object($key) ) {
+ if ( my $obj = $self->get_object(_encode_utf8 $key) ) {
my $retval = $code->($obj);
if ($retval) {
$self->expire($key);
@@ -383,7 +393,7 @@
sub fetch_multi_hashref {
my ( $self, $keys ) = @_;

- return { map { ( $_, $self->fetch($_) ) } @$keys };
+ return { map { ( $_, $self->fetch(_encode_utf8 $_) ) } @$keys };
}

sub get_multi_hashref {
@@ -422,7 +432,7 @@
croak "must specify keys" unless defined($keys);

foreach my $key (@$keys) {
- $self->remove($key);
+ $self->remove(_encode_utf8 $key);
}
}

@@ -442,7 +452,7 @@
my %hash;
foreach my $key ( $self->get_keys() ) {
if ( defined( my $value = $self->get($key) ) ) {
- $hash{$key} = $value;
+ $hash{_decode_utf8 $key} = $value;
}
}
return \%hash;
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/Util.pm CHI/
Util.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/Util.pm 2010-05-22
23:55:18.000000000 +0200
+++ CHI/Util.pm 2010-05-22 23:38:09.000000000 +0200
@@ -5,6 +5,7 @@
use Fcntl qw( :DEFAULT );
use File::Spec::Functions qw(catdir catfile);
use Time::Duration::Parse;
+use Encode;
use strict;
use warnings;
use base qw(Exporter);
@@ -20,6 +21,8 @@
read_dir
unique_id
write_file
+ _decode_utf8
+ _encode_utf8
);

my $Fetch_Flags = O_RDONLY | O_BINARY;
@@ -145,6 +148,30 @@
return ( defined $meta && $meta->isa("Moose::Meta::Class") );
}

+# stolen from CGI.pm
+sub _decode_utf8 {
+ my $val = shift;
+
+ if (Encode::is_utf8($val)) {
+ return $val;
+ }
+ else {
+ return Encode::decode(utf8 => $val);
+ }
+}
+
+sub _encode_utf8 {
+ my $val = shift;
+
+ if (Encode::is_utf8($val)) {
+ return Encode::encode(utf8 => $val);
+ }
+ else {
+ return $val;
+ }
+}
+
+
1;

__END__

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cach...@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-disc...@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.

Jonathan Swartz

unread,

May 23, 2010, 9:55:51 AM5/23/10

to perl-cach...@googlegroups.com

Thanks Jiri. To be honest when you said the problem was outside CHI I
didn't quite believe it either, because a few other people have
complained - I just haven't taken the time to figure out what to do
about it. :)

Can a second person who understands encoding vouch for Jiri's approach
- not necessarily the exact implementation, but the concept of
encoding keys and values as they come in and decoding them as they go
out?

Jon

Tim Bunce

unread,

May 23, 2010, 9:56:20 AM5/23/10

to perl-cach...@googlegroups.com

I've no idea how appropriate this patch is or what the issues are
(I'm not paying attention) but from a performance perspective
I'd be concerned.

I'd rewrite these more like:

On Sun, May 23, 2010 at 05:55:43AM -0700, jira wrote:
> +sub _decode_utf8 {
> + my $val = shift;
> +
> + if (Encode::is_utf8($val)) {
> + return $val;
> + }
> + else {
> + return Encode::decode(utf8 => $val);
> + }
> +}

my $utf8_enc = Encode::find_encoding('utf8');

sub _decode_utf8 {
return shift unless Encode::is_utf8($_[0]);
return $utf8_enc->decode(shift);
}

and similarly for _encode_utf8. But for better performance I'd
drop the functions and put the logic directly where it's needed.

Tim.

Perrin Harkins

unread,

May 23, 2010, 1:03:23 PM5/23/10

to perl-cach...@googlegroups.com

On Sun, May 23, 2010 at 8:55 AM, jira <ji...@prosyn.org> wrote:
> The situation is as follows: if you cache utf8 string (the utf8 flag
> on) then some drivers won't store it at all(
> File driver gives the "wide character in syswrite error".
> Other drivers will store it but will choke on get because, they will
> try to deseralize it even as it was not serialized.

I'm guessing there are some which already handle it. It would be good
to allow skipping any CHI encoding on a per-driver basis, so that
drivers which already support unicode don't do extra work.

- Perrin

jira

unread,

May 23, 2010, 4:23:26 PM5/23/10

to Perl-Cache Discuss

Perrin Harkins wrote:
> On Sun, May 23, 2010 at 8:55 AM, jira <ji...@prosyn.org> wrote:
> > The situation is as follows: if you cache utf8 string (the utf8 flag
> > on) then some drivers won't store it at all(
> > File driver gives the "wide character in syswrite error".
> > Other drivers will store it but will choke on get because, they will
> > try to deseralize it even as it was not serialized.
>
> I'm guessing there are some which already handle it. It would be good
> to allow skipping any CHI encoding on a per-driver basis, so that
> drivers which already support unicode don't do extra work.

The problem is on CHI level not driver level. In the pack_to_data /
unpack_from_data methods at least.
As it concatenates the utf8 data with the metadata the whole string
gets upgraded to utf8 and then
one gets bogus values reading the metadata back.
So I think the drivers should not care about encoding at all. They
will just get passed bytes and that's it.
If we encode all the data, then it solves problems for the rest of the
code.

perlunifaq also suggests to encode/decode all data:
http://search.cpan.org/~jesse/perl-5.12.1/pod/perlunifaq.pod

As for the exact implementation there are sure many (and probably
better) ways.

Kind Regards,
Jiri

jira

unread,

May 23, 2010, 4:40:12 PM5/23/10

to Perl-Cache Discuss

Tim Bunce wrote:
> I've no idea how appropriate this patch is or what the issues are
> (I'm not paying attention) but from a performance perspective
> I'd be concerned.
>
> I'd rewrite these more like:
>
> On Sun, May 23, 2010 at 05:55:43AM -0700, jira wrote:
> > +sub _decode_utf8 {
> > + my $val = shift;
> > +
> > + if (Encode::is_utf8($val)) {
> > + return $val;
> > + }
> > + else {
> > + return Encode::decode(utf8 => $val);
> > + }
> > +}
>
> my $utf8_enc = Encode::find_encoding('utf8');
>
> sub _decode_utf8 {
> return shift unless Encode::is_utf8($_[0]);
> return $utf8_enc->decode(shift);
> }
>
> and similarly for _encode_utf8. But for better performance I'd
> drop the functions put the logic directly where it's needed.

Thanks, this is indeed much faster.
Just to be precise the logic should be inverted:

sub _decode_utf8 {
return shift if Encode::is_utf8($_[0]);
return $utf8_enc->decode(shift);
}

jira

unread,

May 23, 2010, 8:09:25 PM5/23/10

to Perl-Cache Discuss

> As for the exact implementation there are sure many (and probably
> better) ways.
>

My patch was indeed crap. It blindly encoded/decoded all values
even if they were references. So here it is, corrected.
You will probably take a different route, but just to correct myself.

Regards,
Jiri

--------------------------------------------------------------------------------------------------------------------------------------------
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm
CHI/CacheObject.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/CacheObject.pm 2010-05-22
23:55:18.000000000 +0200

+++ CHI/CacheObject.pm 2010-05-24 02:02:00.000000000 +0200

@@ -3,6 +3,8 @@
use strict;
use warnings;

+use CHI::Util qw(_decode_utf8 _encode_utf8);
+
use constant f_key => 0;
use constant f_raw_value => 1;
use constant f_serializer => 2;

@@ -52,6 +54,9 @@
$raw_value = $serializer->serialize($raw_value);
$is_serialized = 1;
}
+ else {
+ $raw_value = _encode_utf8($raw_value);
+ }

# Not sure where this should be set and checked
#
@@ -121,7 +126,10 @@

if ( $self->[f_is_serialized] ) {
$value = $self->serializer->deserialize($value);
}
- $self->[f_value] = $value;

+ else {
+ $value = _decode_utf8($value);
+ }
+ $self->[f_value] = $value;

}
return $self->[f_value];
}
diff -uNr -x '*~' /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm CHI/
Driver.pm
--- /usr/lib/perl5/site_perl/5.8.8/CHI/Driver.pm 2010-05-22
23:55:18.000000000 +0200

+++ CHI/Driver.pm 2010-05-24 02:00:48.000000000 +0200

@@ -259,6 +265,8 @@

croak "must specify key" unless defined($key);
return unless defined($value);

+ $key = _encode_utf8($key);
+

# Fill in $options if not passed, copy if passed, and apply
defaults.
#
if ( !defined($options) ) {

@@ -341,6 +349,7 @@

sub expire {
my ( $self, $key ) = @_;
croak "must specify key" unless defined($key);
+ $key = _encode_utf8($key);

my $time = $Test_Time || time();
if ( defined( my $obj = $self->get_object($key) ) ) {

@@ -356,7 +365,7 @@

my ( $self, $key, $code ) = @_;
croak "must specify key and code" unless defined($key) &&
defined($code);

- if ( my $obj = $self->get_object($key) ) {
+ if ( my $obj = $self->get_object(_encode_utf8 $key) ) {
my $retval = $code->($obj);
if ($retval) {
$self->expire($key);

@@ -383,7 +392,7 @@

sub fetch_multi_hashref {
my ( $self, $keys ) = @_;

- return { map { ( $_, $self->fetch($_) ) } @$keys };
+ return { map { ( $_, $self->fetch(_encode_utf8 $_) ) } @$keys };
}

sub get_multi_hashref {

@@ -422,7 +431,7 @@

croak "must specify keys" unless defined($keys);

foreach my $key (@$keys) {
- $self->remove($key);
+ $self->remove(_encode_utf8 $key);
}
}

@@ -442,7 +451,7 @@

Aristotle Pagaltzis

unread,

May 29, 2010, 8:42:57 PM5/29/10

to perl-cach...@googlegroups.com

* Jonathan Swartz <swa...@pobox.com> [2010-05-23 15:55]:

> Can a second person who understands encoding vouch for Jiri's
> approach - not necessarily the exact implementation, but the
> concept of encoding keys and values as they come in and
> decoding them as they go out?

His patch is not sane. The utility functions look at the UTF8
flag to decide what to do, which is a broken approach, by
definition. (The UTF8 flag signifies the internal format of the
byte buffer of the string, but it says nothing about whether
a string consists of characters or bytes.) It will hide some
problems but cause others.

I do not have enough knowledge of CHI to give good advice on how
to proceed, however.

How many 3rd party drivers exist? How important is it that a new
version of CHI which fixes this problem be backward-compatible
with old drivers? (It will probably require changes to the driver
interface to fix it correctly.)

Regards,
--
Aristotle Pagaltzis // <http://plasmasturm.org/>

Jonathan Swartz

unread,

May 30, 2010, 8:58:14 AM5/30/10

to perl-cach...@googlegroups.com

On May 29, 2010, at 5:42 PM, Aristotle Pagaltzis wrote:

> * Jonathan Swartz <swa...@pobox.com> [2010-05-23 15:55]:
>> Can a second person who understands encoding vouch for Jiri's
>> approach - not necessarily the exact implementation, but the
>> concept of encoding keys and values as they come in and
>> decoding them as they go out?
>
> His patch is not sane. The utility functions look at the UTF8
> flag to decide what to do, which is a broken approach, by
> definition. (The UTF8 flag signifies the internal format of the
> byte buffer of the string, but it says nothing about whether
> a string consists of characters or bytes.) It will hide some
> problems but cause others.
>

I'm not so sure. I know that using is_utf8 is generally Wrong. But
CHI's role is not to interpret the data in any way, merely to store
and retrieve it and make sure it doesn't change in that process. In
that case, isn't it the right thing to make sure the utf8 flag is set
exactly the same after a store and retrieve?

For example, this script

#!/usr/bin/perl -w
use Encode;
use Storable qw(freeze thaw);
use strict;

my ($in, $out);

$in = "\x{263a}b";
$out = thaw(freeze([$in]))->[0];
print "is_utf8 before Storable: " . (Encode::is_utf8($in) ? 't' :
'f') . "\n";
print "is_utf8 after Storable: " . (Encode::is_utf8($out) ? 't' :
'f') . "\n\n";

$in = join('', map { chr($_) } (226, 152, 186, 98));
$out = thaw(freeze([$in]))->[0];
print "is_utf8 before Storable: " . (Encode::is_utf8($in) ? 't' :
'f') . "\n";
print "is_utf8 after Storable: " . (Encode::is_utf8($out) ? 't' :
'f') . "\n\n";

prints

is_utf8 before Storable: t
is_utf8 after Storable: t

is_utf8 before Storable: f
is_utf8 after Storable: f

so Storable is preserving the is_utf8 flag. JSON does the same thing.
We use Storable to serialize reference values, but we store scalar
values raw, so we need to take responsibility for the utf8 flag in the
scalar case.

Aristotle, please help me understand your objections better.

Thanks
Jon

jira

unread,

May 30, 2010, 9:44:29 AM5/30/10

to Perl-Cache Discuss

Jonathan Swartz wrote:
> On May 29, 2010, at 5:42 PM, Aristotle Pagaltzis wrote:
>
> > * Jonathan Swartz <swa...@pobox.com> [2010-05-23 15:55]:
> >> Can a second person who understands encoding vouch for Jiri's
> >> approach - not necessarily the exact implementation, but the
> >> concept of encoding keys and values as they come in and
> >> decoding them as they go out?
> >
> > His patch is not sane. The utility functions look at the UTF8
> > flag to decide what to do, which is a broken approach, by
> > definition. (The UTF8 flag signifies the internal format of the
> > byte buffer of the string, but it says nothing about whether
> > a string consists of characters or bytes.) It will hide some
> > problems but cause others.
> >
>
> I'm not so sure. I know that using is_utf8 is generally Wrong. But
> CHI's role is not to interpret the data in any way, merely to store
> and retrieve it and make sure it doesn't change in that process. In
> that case, isn't it the right thing to make sure the utf8 flag is set
> exactly the same after a store and retrieve?

Hello,

admittedly, I'm not super knowledgeable about the perl Unicode and
probably should not have attempted the patch at all.
But I did some amount of reading and looking at what other code does
and I think the general approach I suggested is correct.

perlunicode says:
"When Perl exchanges data with an extension, the extension should be
able to understand the UTF-8 flag and act accordingly."

And basically it suggests the strategy I used. And that is also what I
do in my code to circumvent the problem
(encode/decode the strings).

Also, from what I've read it seems to me that the utf8 flag actually
does signify character semantics. But I might be just confused.

Kind Regards,
Jiri

Roland Lammel

unread,

May 30, 2010, 10:51:07 AM5/30/10

to perl-cach...@googlegroups.com

Hi all,

Being no expert on Perl Unicode, I have nevertheless had to cope with UTF-8 data for quite some time now. The most important rule from my experience (and I heard some echoes of that on some lists) is that you should make sure to decode when reading/receiving data from external resources (network, file, database) and make sure to encode when sending/writing data to external resources (external here means not under your direct control anymore). The only exception is if your code should act completely transparently or performance requires some tricks. Of course, defining what external means and where external starts is the hard part.

So for CHI this would mean we need to consider keys and values here. External would be the border where the writing to the backend occurs. As CHI is a pure layer for adding cache functionality and it knows nothing about the actual semantics of the keys or values, it needs to be given a hint as to what the data actually is (meaning binary or encoded string). A way to deal with that would be to allow passing params to CHI that specify whether keys and/or values should be considered strings or binaries. Considering backwards compatibility, that would mean treating data as binary by default, so the behaviour of CHI would not change, and introducing an encoding param (either just string with automagic UTF-8 handling similar to the patch proposed, or specifying the encoding to use).

This could allow dwimity, proper handling of encoding and backwards compatibility by still allowing pure binary data as key/value. The downside would be additional options when initializing CHI and adding a little complexity to the core/drivers.

Thoughts?

Cheers

+rl

--
You received this message because you are subscribed to the Google Groups "Perl-Cache Discuss" group.
To post to this group, send email to perl-cach...@googlegroups.com.
To unsubscribe from this group, send email to perl-cache-disc...@googlegroups.com.
For more options, visit this group at http://groups.google.com/group/perl-cache-discuss?hl=en.

--
Roland Lammel
QuikIT - IT Lösungen - flexibel und schnell
Web: http://www.quikit.at
Email: in...@quikit.at

"Enjoy your job, make lots of money, work within the law. Choose any two."

Larry Leszczynski

unread,

May 30, 2010, 2:17:12 PM5/30/10

to perl-cach...@googlegroups.com

On Sun, 30 May 2010 05:58 -0700, "Jonathan Swartz" <swa...@pobox.com>
wrote:

>
> CHI's role is not to interpret the data in any way, merely to store
> and retrieve it and make sure it doesn't change in that process. In
> that case, isn't it the right thing to make sure the utf8 flag is set
> exactly the same after a store and retrieve?

That would be my expectation - my application-level code should work the
same way with caching turned on or turned off. In the typical use case
you're going to call some API to get some data, and under the covers
that data might be coming from the cache, or might be coming e.g.
directly from a database (and also stored in the cache as a side
effect). So at the application level I would handle the data the same
way regardless, and expect CHI to be transparent.

Larry

Jonathan Swartz

unread,

May 30, 2010, 2:45:10 PM5/30/10

to perl-cach...@googlegroups.com

Thanks for the insights, Roland.

When people store values in CHI, they expect to get the same thing out when they retrieve it; so, I believe preserving the value of the utf-8 flag is the right thing to do.

As far as keys and namespaces, they have to be encoded if their utf-8 flag is on; otherwise we will not be able to use them in directory names, filenames, logs, etc.

Given this, I don't see a need for a separate param (and I think this would cause CHI to be "thinking too much" about unicode), but feel free to convince me otherwise. In this case a counter-example would be helpful.

Jon

Roland Lammel

unread,

May 30, 2010, 3:14:55 PM5/30/10

to perl-cach...@googlegroups.com

Hi Jon,

I completely agree that CHI should not think too much (both in terms of performance and simplicity). What I probably failed to point out in the earlier post is that we should just ensure that when a key is presented to CHI, it is stored in such a way that another cache instance (meaning another application, or one on another host) hitting the cache resolves it to the same stored key. With the policy of acting on the utf8 flag when set, this should do it, as data would still be treated as binary, with the exception of strings perl has detected to be utf8. So it would be a properly encoded key anyway.

Sounds good and simple, cheers

+rl

Aristotle Pagaltzis

unread,

Jun 2, 2010, 5:40:45 AM6/2/10

to perl-cach...@googlegroups.com

* Jonathan Swartz <swa...@pobox.com> [2010-05-30 15:00]:

> I'm not so sure. I know that using is_utf8 is generally Wrong.
> But CHI's role is not to interpret the data in any way, merely
> to store and retrieve it and make sure it doesn't change in
> that process. In that case, isn't it the right thing to make
> sure the utf8 flag is set exactly the same after a store and
> retrieve?

Is it? If I store some data under the key `"naïve"`, once with
UTF8 flag turned off and once with the flag on, am I storing the
data under two different keys, or the same key? And if it *is*
considered the same key, and I ask for what that key is, should
I get it with UTF8 flag on (that was how it was first stored) or
off (as it was off in the latest write)?

Note that because Perl considers the flag a transparent internal
implementation detail, it can easily happen that code used the
*same scalar variable* in both cases (flag off, flag on), just
because it also used the variable in some other operation in the
meantime that implicitly upgraded the string.

That’s fine in Perl – a string with UTF8 flag off and a string
with UTF8 flag on mean the same thing if they contain the same
sequence of logical characters even when their internal
representation differs.

But it means that you’d in turn be forcing client code to look at
the UTF8 flag to make sure it’s really passing what it thinks
it’s passing. And IMO any API which forces its clients to care
about the UTF8 flag is broken.

> For example, this script […] prints […] so Storable is
> preserving the is_utf8 flag.

Storable never interprets the data you pass it in any way
whatsoever. CHI does.

> JSON does the same thing.

Colour me dubious. There is no way to express the concept of
a UTF8 flag in JSON.

> We use Storable to serialize reference values, but we store
> scalar values raw, so we need to take responsibility for the
> utf8 flag in the scalar case.

I think the only sane thing to do is to consider all strings to
be strings – and in Perl there is no difference between character
strings and byte strings. You can slurp a JPEG image file into
a scalar, upgrade the scalar (turning on the UTF8 flag), and then
write the scalar back out to another file, and you’ll get the
very same JPEG image back, even though the UTF8 flag was turned
on in internal storage.

The UTF8 flag is misnamed. What it actually means is whether the
internal storage format of the string is a fixed-width packed
bytes array or a variable-width integer sequence. That’s all.

The REAL problem you have is that some of your cache backends can
cope with keys containing characters > 255 and some cannot.

I think it’s a bug in those backends when they cannot cope.

But you could decide to solve the problem centrally in CHI by
defining some kind of universal characters→bytes transliteration
scheme. As it happens, the UTF-8 encoding is a good choice for
such an encoding. In other words, you would encode ALL keys, no
matter whether the UTF8 flag is turned on or off, because the
flag does not change the semantic meaning of the string, ie. the
key `"naïve"` should yield the same encoded result regardless of
whether it was stored in a scalar with the UTF8 flag on or off.

This does however mean that backends which would be capable of
storing characters > 255 will only store the transliterated
versions, just like everyone else, so eg. if you use a DBI or DBM
backend then the data in the store will be harder to examine
because it will be stored in encoded form.

Jonathan Swartz

unread,

Jun 3, 2010, 1:30:35 PM6/3/10

to perl-cach...@googlegroups.com

> Is it? If I store some data under the key `"naïve"`, once with
> UTF8 flag turned off and once with the flag on, am I storing the
> data under two different keys, or the same key? And if it *is*
> considered the same key, and I ask for what that key is, should
> I get it with UTF8 flag on (that was how it was first stored) or
> off (as it was off in the latest write)?

Thanks for your message Aristotle. I still have a tenuous grasp of
these issues, so I appreciate your advice and anything further you can
provide!

Here's how the code looks now for keys and values.

KEYS

* Any key passed to a CHI operation (get, set, remove, etc.) is utf
encoded iff its utf-8 flag is on.
* The encoding is a one-way operation. We don't record that the key
was encoded, and get_keys does not attempt to decode it. (There is no
real support in CHI for storing meta-data about keys.)

The rationale here is that
* I want to use utf-8 strings for keys even in drivers that can't
handle wide characters
* I want to be able to pass the results of get_keys() back into get()
and have it still retrieve the same object, without double-encoding it
(though I realize this will break if someone calls get_keys(), then
somehow turns the utf-8 flag back on before passing it back into get())
* I want to be backwards compatible with existing caches with binary
string keys - thus I cannot encode all keys blindly

http://github.com/jonswar/perl-chi/blob/master/lib/CHI/Driver.pm#L500-502

VALUES

* Any scalar value passed to set is encoded iff the utf-8 flag is on.
* The encoding is a two-way operation. We record the fact that the
value was encoded, and we decode it when retrieving it from the cache.

The rationale here is that
* I want to be able to store utf-8 strings as values even in drivers
that can't handle wide characters
* I want the values to come out exactly the same way as when they were
stored
* I want to be backwards compatible with existing caches with binary
string values - thus I cannot decode all values blindly

http://github.com/jonswar/perl-chi/blob/master/lib/CHI/CacheObject.pm#L60-62
http://github.com/jonswar/perl-chi/blob/master/lib/CHI/CacheObject.pm#L131-132

Here's a test class that attempts to confirm some of this:

http://github.com/jonswar/perl-chi/blob/master/lib/CHI/t/Encode.pm

So. I'm consulting the utf-8 flag in both cases, even though I
understand from all the docs that it is "wrong" to depend on this
flag. But I can't figure out a better way to get the behavior and the
backward compatibility that I want without consulting the flag.

Feedback welcome.

Jon

Aristotle Pagaltzis

unread,

Jun 4, 2010, 12:10:31 PM6/4/10

to perl-cach...@googlegroups.com

Hi Jonathan,

* Jonathan Swartz <swa...@pobox.com> [2010-06-03 19:30]:

> KEYS
>
> * Any key passed to a CHI operation (get, set, remove, etc.) is
> utf encoded iff its utf-8 flag is on.
> * The encoding is a one-way operation. We don't record that the
> key was encoded, and get_keys does not attempt to decode it.
> (There is no real support in CHI for storing meta-data about
> keys.)
>
> The rationale here is that
> * I want to use utf-8 strings for keys even in drivers that
> can't handle wide characters
> * I want to be able to pass the results of get_keys() back into
> get() and have it still retrieve the same object, without
> double-encoding it (though I realize this will break if
> someone calls get_keys(), then somehow turns the utf-8 flag
> back on before passing it back into get())
> * I want to be backwards compatible with existing caches with
> binary string keys - thus I cannot encode all keys blindly
>
> http://github.com/jonswar/perl-chi/blob/master/lib/CHI/Driver.pm#L500-502

that’s a problem.

Keys with only characters < 128 will always yield the same value
because their representation is the same regardless of the UTF8
flag, and keys with characters > 255 will also always yield the
same value because they can only be stored in strings with the
UTF8 flag on.

But for keys with characters in the 128..255 range, there are two
possible internal representations. So a string which contains
such characters will correspond to two different keys, depending
on its UTF8 flag. Different code paths that should access the
same key might therefore end up accessing different keys. This is,
to put it poetically, schizophrenic.

The right thing to do is to either always encode strings for use
as keys (= backends do not have to handle wide characters), or
never encode them (= backends have to decide for themselves how
to handle characters > 255) – rather than encoding them sometimes
and not encoding them other times.

Note that whichever of these changes you make, the only data that
will be affected by this change is data which CHI already
handles in a schizophrenic fashion.

There is no sane solution to centralise the handling of big
characters in CHI if you are aiming for zero compatibility
breakage.

I would seriously opt for the null strategy: simply document
that backends are required to handle big characters in whichever
way they deem best for themselves, unless they tell CHI that it
should encode keys for them, in which case CHI would *always*
encode keys. This way, old backends that were broken WRT big
characters continue to be broken in exactly the same way as they
used to be, i.e. compatibility is automatic. New backends (or new
backend releases) would take this into account.

> VALUES
>
> * Any scalar value passed to set is encoded iff the utf-8 flag
> is on.
> * The encoding is a two-way operation. We record the fact that
> the value was encoded, and we decode it when retrieving it
> from the cache.
>
> The rationale here is that
> * I want to be able to store utf-8 strings as values even in
> drivers that can't handle wide characters
> * I want the values to come out exactly the same way as when
> they were stored
> * I want to be backwards compatible with existing caches with
> binary string values - thus I cannot decode all values blindly
>
> http://github.com/jonswar/perl-chi/blob/master/lib/CHI/CacheObject.pm#L60-62
> http://github.com/jonswar/perl-chi/blob/master/lib/CHI/CacheObject.pm#L131-132

This is sane.

You get different results depending on whether the UTF8 flag is
on or off, but you also process them differently, so that it
cancels out on the bottom line.

> Here's a test class that attempts to confirm some of this:
>
> http://github.com/jonswar/perl-chi/blob/master/lib/CHI/t/Encode.pm
>
> So. I'm consulting the utf-8 flag in both cases, even though
> I understand from all the docs that it is "wrong" to depend on
> this flag. But I can't figure out a better way to get the
> behavior and the backward compatibility that I want without
> consulting the flag.

There is no way to get both. Consulting the flag merely trades
one set of broken behaviours for another.

Jonathan Swartz

unread,

Jun 5, 2010, 10:50:00 AM6/5/10

to perl-cach...@googlegroups.com

> that’s a problem.
>
> Keys with only characters < 128 will always yield the same value
> because their representation is the same regardless of the UTF8
> flag, and keys with characters > 255 will also always yield the
> same value because they can only be stored in strings with the
> UTF8 flag on.
>
> But for keys with characters in the 128..255 range, there are two
> possible internal representations. So a string which contains
> such characters will correspond to two different keys, depending
> on its UTF8 flag. Different code paths that should access the
> same key might therefor end up accessing different keys. This is,
> to put it poetically, schizophrenic.
>

Yes, I see the problem.

Ok, one more try: What if I only encoded strings that contained wide
characters? e.g.

if (is_utf8($key) && $key =~ /[^\x00-\xFF]/) {
encode(utf8 => $key);
}

Then there is no way for a key with characters in the 128..255 to be
stored as two different keys.

I know that it seems simpler and more correct to encode all keys. But
if I do that, I have to decode them all on the way back in (otherwise
I'll get double-encoding when people pass the results of get_keys() or
get_object()->key() back into CHI), which is undesirable (I have to
capture and filter all calls that return keys).

Letting the backends take care of this themselves will either amount
to the same thing, or result in inconsistent behavior.

> Note that whichever of these changes you make, the only data that
> will be affected by this change is data for which CHI already
> handles in a schizophrenic fashion.

I don't see that - right now, keys with chars in the 128..255 range
are always handled as binary chars.

Thanks
Jon

Jonathan Swartz

unread,

Jun 7, 2010, 1:11:50 AM6/7/10

to perl-cach...@googlegroups.com

Another note, both Cache::FastMmap and DBD::SQLite seem to have this
"schizophrenia", as you put it.

#!/usr/bin/perl -w
use Cache::FastMmap;
use Carp::Assert;
use DBI;
use DBD::SQLite;
use strict;

my $binary_off = chr(129);
my $binary_on = substr($binary_off . "\x{263a}", 0,
length($binary_off));
assert($binary_off eq $binary_on);

print "** sqlite **\n";
unlink("sqlite.dat");
my $dbh = DBI->connect("dbi:SQLite:dbname=sqlite.dat","","");
$dbh->do("create table foo (key text)");
my $sth = $dbh->do("insert into foo values (?)", {}, $binary_off);
print "binary_off: " . $dbh->selectcol_arrayref("select count(*)
from foo where key = ?", {}, $binary_off)->[0] . "\n";
print "binary_on: " . $dbh->selectcol_arrayref("select count(*)
from foo where key = ?", {}, $binary_on)->[0] . "\n";

print "** fastmmap **\n";
my $cache = Cache::FastMmap->new();
$cache->set($binary_off, 5);
print "binary_off: " . defined($cache->get($binary_off)) . "\n";
print "binary_on: " . defined($cache->get($binary_on)) . "\n";

This prints

** sqlite **
binary_off: 1
binary_on: 0
** fastmmap **
binary_off: 1
binary_on:

Meaning that, even though $binary_off eq $binary_on, both sqlite and
fastmmap treat them as distinct.

Jon

Reply all

Reply to author

Forward