[rig3] r445 committed - IzuParser: Fix escape of double-underline in URLs....

1 view

Skip to first unread message

ri...@googlecode.com

unread,

Aug 29, 2010, 9:55:29 PM (8/29/10)

to rig3-d...@googlegroups.com

Revision: 445
Author: ralfoide
Date: Sun Aug 29 18:53:21 2010
Log: IzuParser: Fix escape of double-underline in URLs.
OldIzu reader: rewrite inter-izu links.
http://code.google.com/p/rig3/source/detail?r=445

Modified:
/trunk/rig3serv/src/rig/parser/izu_parser.py
/trunk/rig3serv/src/rig/source_reader.py
/trunk/rig3serv/src/tests/parser/test_izu_parser.py

=======================================
--- /trunk/rig3serv/src/rig/parser/izu_parser.py Wed Jul 7 19:31:43 2010
+++ /trunk/rig3serv/src/rig/parser/izu_parser.py Sun Aug 29 18:53:21 2010
@@ -25,7 +25,6 @@

import re
import os
-import sys
import fnmatch
import urllib
import subprocess
@@ -509,9 +508,20 @@
to escape normal [ tags and same for double-underscore, double-quotes
"""
line = self._RE_RMV_ESC_DOUBLE_CHAR.sub(r"\2", line)
+
+ # The double-underscore case must not be escaped from within URLs where
+ # it is valid. We define an URL context is as being a non-special chars
+ # string that contains :// somewhere before the underline, unfortunately
+ # we can't use a non-capturing look-behind expression (?<!...) with a
+ # variable width, so instead we'll use a capturing group and filter
+ # using a lambda.
+ line = self._RE_RMV_ESC_DOUBLE_UNDER.sub(
+ lambda m: not m.group(1) and m.group(3) or
m.group(0), line)
+
return line

- _RE_RMV_ESC_DOUBLE_CHAR = re.compile(r"([_'=\[])(\1+)")
+ _RE_RMV_ESC_DOUBLE_CHAR = re.compile(r"(['=\[])(\1+)")
+ _RE_RMV_ESC_DOUBLE_UNDER = re.compile(r"(://(?:[^ \"\[\]_]|
_[^_])*)?(_)(\2+)")

def _ConvertAccents(self, line):
"""
@@ -523,7 +533,11 @@
line = line.replace(k, v)

try:
- us = line.decode("utf-8")
+ if isinstance(line, unicode):
+ us = line
+ else:
+ us = line.decode("utf-8")
+
for k, v in UTF8_ACCENTS_TO_HTML.iteritems():
if k in us:
us = us.replace(k, v)
=======================================
--- /trunk/rig3serv/src/rig/source_reader.py Sun Aug 29 03:05:15 2010
+++ /trunk/rig3serv/src/rig/source_reader.py Sun Aug 29 18:53:21 2010
@@ -24,8 +24,10 @@
"""
__author__ = "ralfoide at gmail com"

+import codecs
import os
import re
+from binascii import crc32
from datetime import datetime

from rig.source_item import SourceDir, SourceFile, SourceContent
@@ -91,7 +93,7 @@
DIR_PATTERN = re.compile(r"^(\d{4}[-]?\d{2}(?:[-]?\d{2})?)[ _-]
*(?P<name>.*) *$")
DIR_VALID_FILES = re.compile(r"\.(?:izu|jpe?g|html)$")
FILE_PATTERN = re.compile(r"^(\d{4}[-]?\d{2}(?:[-]?\d{2})?)[ _-]
*(?P<name>.*) *\.(?P<ext>izu|html)$")
- OLD_IZU_PATTERN = re.compile(r"^.+?\.old\.izu$")
+ OLD_IZU_PATTERN = re.compile(r"^(?P<cat>.+?)\.old\.izu$")

def __init__(self, log, site_settings, source_settings, path):
"""
@@ -134,7 +136,7 @@
for source_dir, dest_dir, all_files in tree.TraverseDirs():
basename = source_dir.basename()

- if dir_pattern.search(basename):
+ if dir_pattern.match(basename):
# This directory looks like one entry.
# Only keep the "valid" files for directory entries.
valid_files = [f for f in all_files if
dir_valid_files.search(f)]
@@ -153,12 +155,13 @@
# Not a directory entry, so check individual files to see if they
# qualify as individual entries
for f in all_files:
- if self.OLD_IZU_PATTERN.search(f):
+ m = self.OLD_IZU_PATTERN.match(f)
+ if m:
rel_file = RelFile(source_dir.abs_base,

os.path.join(source_dir.rel_curr, f))
- self._ParseOldIzu(rel_file, items)
-
- elif file_pattern.search(f):
+ self._ParseOldIzu(rel_file, m.group("cat"), items)
+
+ elif file_pattern.match(f):
rel_file = RelFile(source_dir.abs_base,

os.path.join(source_dir.rel_curr, f))
date =
datetime.fromtimestamp(self._FileTimeStamp(rel_file.abs_path))
@@ -169,23 +172,46 @@
return items

- OLD_IZU_HEADER =
re.compile(r"^\[s:(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2}):(?P<title>[^\]]*).*$")
-
- def _ParseOldIzu(self, rel_file, items):
+ _RE_OLD_IZU_HEADER =
re.compile(r"^\[s:(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2}):(?P<title>[^\]]*).*$")
+
+ # An inter-izumi named link: [label|page/subpage#s:YYYYMMDD:title], without [[
+ # title is optional and date must be 8-digits.
+ _RE_INTER_IZU_LINK = re.compile(r"(?<!\[)\[(?P<label>[^\|\]]+)\|
(?P<page>[^#:|\]]+)#s:(?P<date>[0-9]{8})(?::(?P<title>[^\]]+))?\]")
+
+
+ def _ParseOldIzu(self, rel_file, cat, items):
"""
"""
- f = file(rel_file.abs_path)
+ f = file(rel_file.abs_path, "rU")

SEP = "----"

# First line must have some izu tags
- tags = None
+ tags = {}
for line in f:
if line.strip() == SEP:
break
if not tags and line:
tags = IzuParser(self._log, None,
None).ParseFirstLine(line)

+ # If we find an encoding tag, reopen the file using that encoding
+ encoding = tags.get("encoding", None)
+ if encoding:
+ f.close()
+ f = codecs.open(rel_file.abs_path, mode="rU",
encoding=encoding)
+ # Skip the tag section
+ for line in f:
+ if line.strip() == SEP:
+ break
+
+ # Use the category based on the filename if there's no override in the file tags
+ if not "cat" in tags:
+ tags["cat"] = { cat: True }
+
+ izumi_base_url = tags.get("izumi_base_url", None)
+ if izumi_base_url and not izumi_base_url.endswith("/"):
+ izumi_base_url += "/"
+
content = None
date = None
title = None
@@ -207,7 +233,7 @@
date = None
title = None

- m = self.OLD_IZU_HEADER.match(line)
+ m = self._RE_OLD_IZU_HEADER.match(line)
if m:
date = datetime(
int(m.group("year")),
@@ -216,6 +242,16 @@
title = m.group("title")

elif date and title:
+ if "#s" in line and not "|#s" in line:
+ pass
+ if izumi_base_url:
+ # Convert old inter-izumi links into hard URLs (e.g. only links
+ # from one izumi page to another. This does not affect intra-izumi
+ # links inside the same category, as those will be supported by
+ # rig3 directly.)
+ line = self._RE_INTER_IZU_LINK.sub(
+ lambda m: self._ConvertInterIzuLinks(m,
izumi_base_url), line)
+
content += line

if content:
@@ -229,6 +265,28 @@

f.close()

+ def _ConvertInterIzuLinks(self, m, izumi_base_url):
+ label = m.group("label")
+ date = m.group("date")
+ title = m.group("title")
+ page = m.group("page")
+
+ # Compute the same key than RBlog::BlogEntryKey from izumi.sourforge.net
+ # see http://izumi.cvs.sourceforge.net/viewvc/izumi/izumi/src/RBlog.php?view=markup&pathrev=HEAD
+ # at line 348.
+ if title:
+ key = title.lower()
+ key = re.sub(r"[ \-_=+\[\]{};:'\",./<>?`~!@#$%^&*()\\|]", "_",
key)
+ key = re.sub(r"[^0123456789abcdefghijklmnopqrstuvwxyz_]", "",
key)
+ key = date + "_" + key
+ if len(key) > 32:
+ # shorten with a crc32
+ key = "%s_%x" % (key[0:23], crc32(date + title))
+ else:
+ key = date
+
+ return "[%s|%s%s?s=%s]" % (label, izumi_base_url, page, key)
+

# Utilities, overridable for unit tests

=======================================
--- /trunk/rig3serv/src/tests/parser/test_izu_parser.py Sun Apr 4 15:59:00 2010
+++ /trunk/rig3serv/src/tests/parser/test_izu_parser.py Sun Aug 29 18:53:21 2010
@@ -112,6 +112,13 @@
'\n[[[3[[[] [[2[[]',
self._Render("[[[[3[[[[] [[[2[[[]"))

+ def testNoEscapeInURLs(self):
+ # ___ (2+) can be present in an URL and should not espaced to be replaced
+ # by n-1 underscores, as it's not a valid pattern.
+ self.assertEquals(
+ '\n<a href="http://my___url.com">some
label</a>',
+ self._Render("[some label|http://my___url.com]"))
+
def testHtmlEscapes(self):
self.assertEquals(
'\nfoo<bar>zoo&luu',

Reply all

Reply to author

Forward

0 new messages