Revision: 453
Author: ralfoide
Date: Mon Aug 30 23:36:44 2010
Log: Switching back to ISO-8859-1 (Latin-1) processing workflow.
All internal processing is now officially done in ISO-8859-1.
Output is fixed to UTF-8.
Fixed test runner to be able to specify tests to run.
http://code.google.com/p/rig3/source/detail?r=453
Modified:
/trunk/rig3serv/misc/Configuration.txt
/trunk/rig3serv/misc/TaskList.txt
/trunk/rig3serv/src/rig/cache.py
/trunk/rig3serv/src/rig/parser/izu_parser.py
/trunk/rig3serv/src/rig/site/site_default.py
/trunk/rig3serv/src/rig/site_base.py
/trunk/rig3serv/src/rig/sites_settings.py
/trunk/rig3serv/src/rig/source_item.py
/trunk/rig3serv/src/rig/source_reader.py
/trunk/rig3serv/src/rig/template/tag.py
/trunk/rig3serv/src/rig/version.py
/trunk/rig3serv/src/test_rig3.py
/trunk/rig3serv/testdata/album/blog1/2007-10-07_Folder 1/index.izu
/trunk/rig3serv/testdata/album/blog1/file_items/2007-09-09 Izu File
Item.izu
=======================================
--- /trunk/rig3serv/misc/Configuration.txt Sun Aug 29 21:32:32 2010
+++ /trunk/rig3serv/misc/Configuration.txt Mon Aug 30 23:36:44 2010
@@ -112,7 +112,9 @@
5- Encoding
----------------------
-By default rig3 considers input files to be UTF-8.
+By default rig3 considers input files to be Latin-1 (aka ISO-8859-1).
+
+Output files are always UTF-8.
You can override this at 3 levels:
- per site, using encoding=type in the .rc file
@@ -124,6 +126,13 @@
Full list is here:
http://docs.python.org/library/codecs.html#standard-encodings
+Internal processing is done in ISO-8859-1, no matter what the input
encoding is.
+You can change the input encoding, as long as the content can be converted to
+ISO-8859-1. Unfortunately, this limitation cannot be easily lifted. However,
+characters that have no ISO-8859-1 equivalent are output as XML numeric
+character references (e.g. &#8364; in decimal), which means that as a
+workaround you can at least post-process the output.
+
----------------------
6- Category Filter
=======================================
--- /trunk/rig3serv/misc/TaskList.txt Sun Aug 29 23:29:40 2010
+++ /trunk/rig3serv/misc/TaskList.txt Mon Aug 30 23:36:44 2010
@@ -38,6 +38,7 @@
---- Done Version 0.4 ----
+20100830 Engine: Switching back to ISO-8859-1 (Latin-1) processing
workflow.
20100829 Engine: Generate UTF-8 files by default. Full unicode workflow
support.
20100829 Engine: Switch Hashable and Cache from MD5 to SHA1 keys
20100829 Engine: Add support for encoding. Switch to explicit UTF-8 by
default.
=======================================
--- /trunk/rig3serv/src/rig/cache.py Sun Aug 29 23:29:40 2010
+++ /trunk/rig3serv/src/rig/cache.py Mon Aug 30 23:36:44 2010
@@ -278,7 +278,7 @@
os.unlink(p)
os.rmdir(dir_path)
except OSError, e:
- self.Log().Exception("RemoveDir '%s' failed: %s", dir_path, e)
+ self._log.Exception("RemoveDir '%s' failed: %s", dir_path, e)
#------------------------
# Local Variables:
=======================================
--- /trunk/rig3serv/src/rig/parser/izu_parser.py Sun Aug 29 22:17:07 2010
+++ /trunk/rig3serv/src/rig/parser/izu_parser.py Mon Aug 30 23:36:44 2010
@@ -146,6 +146,10 @@
if self._file:
try:
line = self._file.readline()
+ if isinstance(line, unicode):
+ # Internally we only process ISO-8859-1; characters that cannot be
+ # encoded are replaced by XML numeric character references (&#NNN;)
+ line = line.encode("iso-8859-1", "xmlcharrefreplace")
except UnicodeDecodeError, e:
raise Exception("Failed to read line from %s" %
self._filename,
"UnicodeDecodeError: " + str(e))
@@ -216,13 +220,17 @@
rel_file = None
if isinstance(filestream, (str, unicode)):
# open with 1=line buffered, U=universal end-of-lines
- f = codecs.open(filestream, mode="rU", buffering=1,
encoding=encoding)
+ f = codecs.open(filestream, mode="rU", buffering=1,
+ encoding=encoding,
+ errors="xmlcharrefreplace")
filename = filestream
elif isinstance(filestream, RelPath):
filename = filestream.abs_path
rel_file = filestream
# open with 1=line buffered, U=universal end-of-lines
- f = codecs.open(filename, mode="rU", buffering=1,
encoding=encoding)
+ f = codecs.open(filename, mode="rU", buffering=1,
+ encoding=encoding,
+ errors="xmlcharrefreplace")
else:
f = filestream
filename = "<internal stream>"
@@ -266,7 +274,11 @@
def ParseFileFirstLine(self, filename, encoding):
"""
- Parses the *first* line of a *file*, using the given optional
encoding.
+ Parses the *first* 2 lines of a *file*, using the given optional
encoding.
+
+ Note: actually we use 2 lines, as it's entirely possible that the
+ file needs the first line to be different, typically an XHTML file
+ might need to have an <xml> declaration on the first line.
This is a wrapper to open the file, extract the first line and
return the result of ParseFirstLine on it.
@@ -275,8 +287,15 @@
try:
if isinstance(filename, RelPath):
filename = filename.abs_path
- f = codecs.open(filename, mode="rU", buffering=1,
encoding=encoding)
- line = f.readline()
+ f = codecs.open(filename, mode="rU", buffering=1,
+ encoding=encoding, errors="xmlcharrefreplace")
+ line = f.readline() + f.readline()
+ if isinstance(line, unicode):
+ # Internally we only process ISO-8859-1; characters that cannot be
+ # encoded are replaced by XML numeric character references (&#NNN;)
+ line = line.encode("iso-8859-1", "xmlcharrefreplace")
+ # strip \r\n from it
+ line = re.sub("[\r\n]", "", line)
return self.ParseFirstLine(line)
finally:
if f:
@@ -574,7 +593,7 @@
if is_unicode:
us = line
else:
- us = line.decode("utf-8")
+ us = unicode(line, "utf-8")
for k, v in UTF8_ACCENTS_TO_HTML.iteritems():
if k in us:
=======================================
--- /trunk/rig3serv/src/rig/site/site_default.py Sun Aug 29 23:29:40 2010
+++ /trunk/rig3serv/src/rig/site/site_default.py Mon Aug 30 23:36:44 2010
@@ -53,8 +53,6 @@
def __repr__(self):
content = self.content
- if isinstance(content, unicode):
- content = content.encode("unicode_escape")
return "<%s: title %s, date %s, link %s, content %s>" % (
self.__class__.__name__, self.title, self.date,
self.permalink, content)
@@ -768,17 +766,17 @@
keywords):
sections = {}
tags = {}
+
+ if source_item.source_settings.encoding:
+ encoding = source_item.source_settings.encoding
+ else:
+ encoding = self._site_settings.encoding
if izu_file:
self._log.Info("[%s] Render '%s' to HTML",
self._site_settings.public_name,
izu_file)
- if source_item.source_settings.encoding:
- encoding = source_item.source_settings.encoding
- else:
- encoding = self._site_settings.encoding
-
p = IzuParser(self._log,
keywords["rig_base"],
keywords["img_gen_script"])
@@ -808,7 +806,7 @@
sections[k] = ""
elif html_file:
- sections["html"] = self._ReadFile(html_file)
+ sections["html"] = self._ReadFile(html_file, encoding)
izu_parser = IzuParser(self._log,
keywords["rig_base"],
keywords["img_gen_script"])
@@ -1138,7 +1136,7 @@
def _GetRating(self, ascii):
return self._RATING.get(ascii, self._RATING_DEFAULT)
- def _ReadFile(self, full_path):
+ def _ReadFile(self, full_path, encoding):
"""
Returns the content of a file as a string.
Will raise an IOError if the file cannot be read.
@@ -1146,9 +1144,20 @@
if isinstance(full_path, RelPath):
full_path = full_path.abs_path
- f = file(full_path)
+ # First line may have some izu tags
+ tags = IzuParser(self._log, None,
None).ParseFileFirstLine(full_path, encoding)
+ encoding = tags.get("encoding", encoding)
+
+ f = codecs.open(full_path, mode="rU",
+ encoding=encoding,
+ errors="xmlcharrefreplace")
data = f.read()
f.close()
+
+ # Internally we only process ISO-8859-1; characters that cannot be
+ # encoded are replaced by XML numeric character references (&#NNN;)
+ data = data.encode("iso-8859-1", "xmlcharrefreplace")
+
return data
def _WriteFile(self, data, dest_dir, leafname):
@@ -1179,8 +1188,12 @@
self._site_settings.public_name,
dest_file)
- f = codecs.open(dest_file, mode="wb", encoding="utf-8")
- f.write(data)
+ f = codecs.open(dest_file, mode="wb",
+ encoding="utf-8",
+ errors="xmlcharrefreplace")
+ # Internally the data was processed as iso-8859-1, so use that
+ # to decode and re-encode to UTF-8 for output.
+ f.write(data.decode("iso-8859-1"))
f.close()
return dest_file
=======================================
--- /trunk/rig3serv/src/rig/site_base.py Mon Oct 26 18:50:51 2009
+++ /trunk/rig3serv/src/rig/site_base.py Mon Aug 30 23:36:44 2010
@@ -101,11 +101,12 @@
try:
# Note: do not display content_gen as it's an internal pointer
that
# changes between runs.
+ title = self.title
return "[%s: %s, %s, %s, %s, %s]" % (
self.__class__.__name__,
self.source_item,
self.date,
- self.title,
+ title,
self.categories,
self.permalink)
except:
=======================================
--- /trunk/rig3serv/src/rig/sites_settings.py Sun Aug 29 21:19:47 2010
+++ /trunk/rig3serv/src/rig/sites_settings.py Mon Aug 30 23:36:44 2010
@@ -223,7 +223,7 @@
- enable_sharing(bool): When true, add links to share posts to
Facebook, twitter, etc.
- index_exclude(str): An inclusion-exclusion list of categories to
exclude from
the generic "all recents items" page.
- - encoding(str): Encoding of Izu/HTML text files. Default is UTF-8.
+ - encoding(str): Encoding of Izu/HTML text files. Default is Latin-1
(ISO-8859-1).
Can be overridden per source.
"""
def __init__(self,
@@ -266,7 +266,7 @@
youtube_sy="385",
enable_sharing=False,
index_exclude=IncludeExclude(IncludeExclude.ALL, None),
- encoding="utf-8"
+ encoding="iso-8859-1"
):
# Note: this is *always* called using the default values defined
in the
# constructor. If you need to change a setting loaded from an RC
file,
=======================================
--- /trunk/rig3serv/src/rig/source_item.py Sun Aug 29 23:29:40 2010
+++ /trunk/rig3serv/src/rig/source_item.py Mon Aug 30 23:36:44 2010
@@ -34,7 +34,7 @@
- rig_base(str): Base URL for rig1 for generating rig1 image links.
- encoding(str): Text encoding of Izu/HTML files for the source.
When set, overrides the global settings' encoding
- (which is utf-8 by default).
+ which is Latin-1 (ISO-8859-1) by default.
"""
def __init__(self, rig_base=None, encoding=None):
super(SourceSettings, self).__init__()
=======================================
--- /trunk/rig3serv/src/rig/source_reader.py Sun Aug 29 22:17:07 2010
+++ /trunk/rig3serv/src/rig/source_reader.py Mon Aug 30 23:36:44 2010
@@ -197,7 +197,9 @@
if "encoding" in tags:
tags = IzuParser(self._log, None,
None).ParseFileFirstLine(rel_file.abs_path, encoding)
- f = codecs.open(rel_file.abs_path, mode="rU", encoding=encoding)
+ f = codecs.open(rel_file.abs_path, mode="rU",
+ encoding=encoding,
+ errors="xmlcharrefreplace")
# Skip to the first section
for line in f:
if line.strip() == SEP:
@@ -215,6 +217,11 @@
date = None
title = None
for line in f:
+ if isinstance(line, unicode):
+ # Internally we only process ISO-8859-1; characters that cannot be
+ # encoded are replaced by XML numeric character references (&#NNN;)
+ line = line.encode("iso-8859-1", "xmlcharrefreplace")
+
if line.strip() == SEP:
continue
=======================================
--- /trunk/rig3serv/src/rig/template/tag.py Sun Aug 29 23:29:40 2010
+++ /trunk/rig3serv/src/rig/template/tag.py Mon Aug 30 23:36:44 2010
@@ -196,7 +196,8 @@
for value in result:
d = dict(context) # clone context before updating it
d[var] = value
- s += content.Generate(log, d)
+ u = content.Generate(log, d)
+ s += u
return s
=======================================
--- /trunk/rig3serv/src/rig/version.py Sun Aug 29 23:29:40 2010
+++ /trunk/rig3serv/src/rig/version.py Mon Aug 30 23:36:44 2010
@@ -13,7 +13,7 @@
To enable substitutions, do something like this:
$ svn propset svn:keywords "Date Author Revision HeadURL Id" version.py
------
+----
Part of Rig3.
Copyright (C) 2007-2009 ralfoide gmail com
=======================================
--- /trunk/rig3serv/src/test_rig3.py Wed Sep 2 21:57:04 2009
+++ /trunk/rig3serv/src/test_rig3.py Mon Aug 30 23:36:44 2010
@@ -36,9 +36,9 @@
Overrides unittest.TestProgram to use our own test list if none is
specified on the command line.
"""
- def __init__(self, tests):
+ def __init__(self, tests, argv=None):
self._tests = tests
- unittest.TestProgram.__init__(self)
+ unittest.TestProgram.__init__(self, argv=argv)
def runTests(self):
RigTestCase.setVerbose(self.verbosity == 2)
@@ -48,7 +48,7 @@
#------------
-def get_tests():
+def get_tests(test_filter=None):
"""
Create a test suite with all the tests to run.
"""
@@ -72,7 +72,9 @@
m = list(base)
m.append(file)
m = m[start:]
- modules.append(".".join(m))
+ name = ".".join(m)
+ if (not test_filter) or name in test_filter:
+ modules.append(name)
print >>sys.stderr, "Rig Tests:", ", ".join([m.split(".")[-1] for m in
modules])
@@ -84,8 +86,18 @@
#------------------------
if __name__ == "__main__":
print >>sys.stderr, "UT Main", sys.argv
- tests = get_tests()
- p = CustomTestProgram(tests)
+
+ argv = []
+ test_filter = []
+ for a in sys.argv:
+ if isinstance(a, str) and a.startswith("test"):
+ test_filter.append(a)
+ else:
+ argv.append(a)
+
+ tests = get_tests(test_filter)
+
+ p = CustomTestProgram(tests, argv=argv)
#------------------------
=======================================
--- /trunk/rig3serv/testdata/album/blog1/2007-10-07_Folder 1/index.izu Sun
Aug 29 21:19:47 2010
+++ /trunk/rig3serv/testdata/album/blog1/2007-10-07_Folder 1/index.izu Mon
Aug 30 23:36:44 2010
@@ -18,6 +18,11 @@
Rig Image: [This is a rig image|rigimg:124:T12896*.jpg]
+This file is iso-8859-1 encoded
+and here's a funny accent to deal with: Månsson.
+
[s:fr]
Un mot ou deux en francais, avec des accents en ISO 8859-1:
ça, où est le pré près du prêt?
+
+
=======================================
--- /trunk/rig3serv/testdata/album/blog1/file_items/2007-09-09 Izu File
Item.izu Sun Aug 29 23:29:40 2010
+++ /trunk/rig3serv/testdata/album/blog1/file_items/2007-09-09 Izu File
Item.izu Mon Aug 30 23:36:44 2010
@@ -1,4 +1,4 @@
-[izu:cat:bar,foo,other]
+[izu:cat:bar,foo,other] [izu:encoding:utf-8]
You can have sections but it's not mandatory. The ''en'' one is the
default.
[s:en]
@@ -11,6 +11,8 @@
when using the directory-based items.
Rig link: [This is a rig link|riglink:T12896*.jpg]
+This file is UTF-8 encoded.
+
This little accent here causes a __lot__ of trouble because it encodes to
0xE5
and the encoding behavior is different whether I run this using Windows'
WPython
or cygwin's python (sigh, no kidding):