Reviewers: Nigel Tao
Marcel van Lohuizen uploaded a change:
https://go-review.googlesource.com/7676
text/encoding: added interfaces for charset mappings.
See design notes in registry.go
Change-Id: If3c6464a2281001459a64d07afc902fb427f9c41
---
A encoding/map.go
A encoding/registry/gen.go
A encoding/registry/mib.go
A encoding/registry/registry.go
4 files changed, 576 insertions(+), 0 deletions(-)
diff --git a/encoding/map.go b/encoding/map.go
new file mode 100644
index 0000000..03c2b1f
--- /dev/null
+++ b/encoding/map.go
@@ -0,0 +1,30 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package encoding
+
+import (
+ "errors"
+)
+
+var (
+ // ErrNotSupported is returned if an encoding is known, but not supported.
+ ErrNotSupported = errors.New("encoding: encoding not supported")
+
+ // ErrInvalidLabel indicates that a given encoding is unknown.
+ ErrInvalidLabel = errors.New("encoding: label not valid")
+)
+
+// A Mapper maps labels to encodings.
+//
+// The packages mimemap and htmlmap are examples of implementations.
+type Mapper interface {
+ // Get returns an encoding for the given label, or an error if an encoding
+ // could not be associated with this label.
+ Get(label string) (Encoding, error)
+
+ // Name reports the canonical name of the given Encoding. It will return the
+ // empty string if this Mapper does not map to the given Encoding.
+ Name(e Encoding) string
+}
diff --git a/encoding/registry/gen.go b/encoding/registry/gen.go
new file mode 100644
index 0000000..a741944
--- /dev/null
+++ b/encoding/registry/gen.go
@@ -0,0 +1,76 @@
+// Copyright 2015 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build ignore
+
+package main
+
+import (
+ "bytes"
+ "encoding/xml"
+ "fmt"
+ "io"
+ "log"
+ "strings"
+
+ "
golang.org/x/text/internal/gen"
+)
+
+// registry mirrors the parts of the IANA character-sets XML document that this
+// generator reads.
+type registry struct {
+ XMLName xml.Name `xml:"registry"`
+ Updated string `xml:"updated"`
+ // Registry holds the nested <registry> elements; main only uses the first.
+ Registry []struct {
+ ID string `xml:"id,attr"`
+ Record []struct {
+ Name string `xml:"name"`
+ MIB string `xml:"value"` // emitted verbatim as the value of the generated constant
+ Alias []string `xml:"alias"` // searched for a "cs"-prefixed alias to name the constant
+ MIME string `xml:"preferred_alias"` // added to the generated comment when non-empty
+ } `xml:"record"`
+ } `xml:"registry"`
+}
+
+// main generates mib.go. It decodes the IANA character set registry and emits
+// one MIB constant per record, named after the record's "cs" alias, preceded
+// by the fixed Other and Unknown constants.
+func main() {
+	r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml")
+	var reg registry
+	if err := xml.NewDecoder(r).Decode(&reg); err != nil && err != io.EOF {
+		log.Fatalf("Error decoding charset registry: %v", err)
+	}
+	// Check for an empty registry on its own: formatting the error message
+	// with reg.Registry[0] would panic when there are no entries at all.
+	if len(reg.Registry) == 0 {
+		log.Fatal("No registry found in character set file")
+	}
+	if id := reg.Registry[0].ID; id != "character-sets-1" {
+		log.Fatalf("Unexpected ID %s", id)
+	}
+
+	w := &bytes.Buffer{}
+	fmt.Fprintln(w, "const (")
+	fmt.Fprintln(w, "// Other is used if a known character set is not currently registered by IANA.")
+	fmt.Fprintln(w, "Other MIB = 1")
+	fmt.Fprintln(w, "// Unknown is used when a given encoding type is not known.")
+	fmt.Fprintln(w, "Unknown MIB = 2")
+	for _, rec := range reg.Registry[0].Record {
+		// The constant is named after an alias of the form "csXxx" that
+		// contains no hyphen. If several aliases qualify, the last one wins.
+		constName := ""
+		for _, a := range rec.Alias {
+			if strings.HasPrefix(a, "cs") && !strings.Contains(a, "-") {
+				// Some of the constant definitions have comments in them. Strip those.
+				constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0])
+			}
+		}
+		if constName == "" {
+			switch rec.MIB {
+			case "2085":
+				constName = "HZGB2312" // Not listed as alias for some reason.
+			default:
+				log.Fatalf("No cs alias defined for %s.", rec.MIB)
+			}
+		}
+		fmt.Fprintf(w, "%s MIB = %s // %s", constName, rec.MIB, rec.Name)
+		if rec.MIME != "" {
+			fmt.Fprintf(w, " (mime: %s)", rec.MIME)
+		}
+		fmt.Fprintln(w)
+	}
+	fmt.Fprintln(w, ")")
+
+	gen.WriteGoFile("mib.go", "registry", w.Bytes())
+}
diff --git a/encoding/registry/mib.go b/encoding/registry/mib.go
new file mode 100644
index 0000000..9e8389e
--- /dev/null
+++ b/encoding/registry/mib.go
@@ -0,0 +1,267 @@
+// This file was generated by go generate; DO NOT EDIT
+
+package registry
+
+const (
+ // Other is used if a known character set is not currently registered by IANA.
+ Other MIB = 1
+ // Unknown is used when a given encoding type is not known.
+ Unknown MIB = 2
+ ASCII MIB = 3 // US-ASCII (mime: US-ASCII)
+ ISOLatin1 MIB = 4 // ISO_8859-1:1987 (mime: ISO-8859-1)
+ ISOLatin2 MIB = 5 // ISO_8859-2:1987 (mime: ISO-8859-2)
+ ISOLatin3 MIB = 6 // ISO_8859-3:1988 (mime: ISO-8859-3)
+ ISOLatin4 MIB = 7 // ISO_8859-4:1988 (mime: ISO-8859-4)
+ ISOLatinCyrillic MIB = 8 // ISO_8859-5:1988 (mime: ISO-8859-5)
+ ISOLatinArabic MIB = 9 // ISO_8859-6:1987 (mime: ISO-8859-6)
+ ISOLatinGreek MIB = 10 // ISO_8859-7:1987 (mime: ISO-8859-7)
+ ISOLatinHebrew MIB = 11 // ISO_8859-8:1988 (mime: ISO-8859-8)
+ ISOLatin5 MIB = 12 // ISO_8859-9:1989 (mime: ISO-8859-9)
+ ISOLatin6 MIB = 13 // ISO-8859-10 (mime: ISO-8859-10)
+ ISOTextComm MIB = 14 // ISO_6937-2-add
+ HalfWidthKatakana MIB = 15 // JIS_X0201
+ JISEncoding MIB = 16 // JIS_Encoding
+ ShiftJIS MIB = 17 // Shift_JIS (mime: Shift_JIS)
+ EUCPkdFmtJapanese MIB = 18 // Extended_UNIX_Code_Packed_Format_for_Japanese (mime: EUC-JP)
+ EUCFixWidJapanese MIB = 19 // Extended_UNIX_Code_Fixed_Width_for_Japanese
+ ISO4UnitedKingdom MIB = 20 // BS_4730
+ ISO11SwedishForNames MIB = 21 // SEN_850200_C
+ ISO15Italian MIB = 22 // IT
+ ISO17Spanish MIB = 23 // ES
+ ISO21German MIB = 24 // DIN_66003
+ ISO60Norwegian1 MIB = 25 // NS_4551-1
+ ISO69French MIB = 26 // NF_Z_62-010
+ ISO10646UTF1 MIB = 27 // ISO-10646-UTF-1
+ ISO646basic1983 MIB = 28 // ISO_646.basic:1983
+ INVARIANT MIB = 29 // INVARIANT
+ ISO2IntlRefVersion MIB = 30 // ISO_646.irv:1983
+ NATSSEFI MIB = 31 // NATS-SEFI
+ NATSSEFIADD MIB = 32 // NATS-SEFI-ADD
+ NATSDANO MIB = 33 // NATS-DANO
+ NATSDANOADD MIB = 34 // NATS-DANO-ADD
+ ISO10Swedish MIB = 35 // SEN_850200_B
+ KSC56011987 MIB = 36 // KS_C_5601-1987
+ ISO2022KR MIB = 37 // ISO-2022-KR (mime: ISO-2022-KR)
+ EUCKR MIB = 38 // EUC-KR (mime: EUC-KR)
+ ISO2022JP MIB = 39 // ISO-2022-JP (mime: ISO-2022-JP)
+ ISO2022JP2 MIB = 40 // ISO-2022-JP-2 (mime: ISO-2022-JP-2)
+ ISO13JISC6220jp MIB = 41 // JIS_C6220-1969-jp
+ ISO14JISC6220ro MIB = 42 // JIS_C6220-1969-ro
+ ISO16Portuguese MIB = 43 // PT
+ ISO18Greek7Old MIB = 44 // greek7-old
+ ISO19LatinGreek MIB = 45 // latin-greek
+ ISO25French MIB = 46 // NF_Z_62-010_(1973)
+ ISO27LatinGreek1 MIB = 47 // Latin-greek-1
+ ISO5427Cyrillic MIB = 48 // ISO_5427
+ ISO42JISC62261978 MIB = 49 // JIS_C6226-1978
+ ISO47BSViewdata MIB = 50 // BS_viewdata
+ ISO49INIS MIB = 51 // INIS
+ ISO50INIS8 MIB = 52 // INIS-8
+ ISO51INISCyrillic MIB = 53 // INIS-cyrillic
+ ISO54271981 MIB = 54 // ISO_5427:1981
+ ISO5428Greek MIB = 55 // ISO_5428:1980
+ ISO57GB1988 MIB = 56 // GB_1988-80
+ ISO58GB231280 MIB = 57 // GB_2312-80
+ ISO61Norwegian2 MIB = 58 // NS_4551-2
+ ISO70VideotexSupp1 MIB = 59 // videotex-suppl
+ ISO84Portuguese2 MIB = 60 // PT2
+ ISO85Spanish2 MIB = 61 // ES2
+ ISO86Hungarian MIB = 62 // MSZ_7795.3
+ ISO87JISX0208 MIB = 63 // JIS_C6226-1983
+ ISO88Greek7 MIB = 64 // greek7
+ ISO89ASMO449 MIB = 65 // ASMO_449
+ ISO90 MIB = 66 // iso-ir-90
+ ISO91JISC62291984a MIB = 67 // JIS_C6229-1984-a
+ ISO92JISC62991984b MIB = 68 // JIS_C6229-1984-b
+ ISO93JIS62291984badd MIB = 69 // JIS_C6229-1984-b-add
+ ISO94JIS62291984hand MIB = 70 // JIS_C6229-1984-hand
+ ISO95JIS62291984handadd MIB = 71 // JIS_C6229-1984-hand-add
+ ISO96JISC62291984kana MIB = 72 // JIS_C6229-1984-kana
+ ISO2033 MIB = 73 // ISO_2033-1983
+ ISO99NAPLPS MIB = 74 // ANSI_X3.110-1983
+ ISO102T617bit MIB = 75 // T.61-7bit
+ ISO103T618bit MIB = 76 // T.61-8bit
+ ISO111ECMACyrillic MIB = 77 // ECMA-cyrillic
+ ISO121Canadian1 MIB = 78 // CSA_Z243.4-1985-1
+ ISO122Canadian2 MIB = 79 // CSA_Z243.4-1985-2
+ ISO123CSAZ24341985gr MIB = 80 // CSA_Z243.4-1985-gr
+ ISO88596E MIB = 81 // ISO_8859-6-E (mime: ISO-8859-6-E)
+ ISO88596I MIB = 82 // ISO_8859-6-I (mime: ISO-8859-6-I)
+ ISO128T101G2 MIB = 83 // T.101-G2
+ ISO88598E MIB = 84 // ISO_8859-8-E (mime: ISO-8859-8-E)
+ ISO88598I MIB = 85 // ISO_8859-8-I (mime: ISO-8859-8-I)
+ ISO139CSN369103 MIB = 86 // CSN_369103
+ ISO141JUSIB1002 MIB = 87 // JUS_I.B1.002
+ ISO143IECP271 MIB = 88 // IEC_P27-1
+ ISO146Serbian MIB = 89 // JUS_I.B1.003-serb
+ ISO147Macedonian MIB = 90 // JUS_I.B1.003-mac
+ ISO150GreekCCITT MIB = 91 // greek-ccitt
+ ISO151Cuba MIB = 92 // NC_NC00-10:81
+ ISO6937Add MIB = 93 // ISO_6937-2-25
+ ISO153GOST1976874 MIB = 94 // GOST_19768-74
+ ISO8859Supp MIB = 95 // ISO_8859-supp
+ ISO10367Box MIB = 96 // ISO_10367-box
+ ISO158Lap MIB = 97 // latin-lap
+ ISO159JISX02121990 MIB = 98 // JIS_X0212-1990
+ ISO646Danish MIB = 99 // DS_2089
+ USDK MIB = 100 // us-dk
+ DKUS MIB = 101 // dk-us
+ KSC5636 MIB = 102 // KSC5636
+ Unicode11UTF7 MIB = 103 // UNICODE-1-1-UTF-7
+ ISO2022CN MIB = 104 // ISO-2022-CN
+ ISO2022CNEXT MIB = 105 // ISO-2022-CN-EXT
+ UTF8 MIB = 106 // UTF-8
+ ISO885913 MIB = 109 // ISO-8859-13
+ ISO885914 MIB = 110 // ISO-8859-14
+ ISO885915 MIB = 111 // ISO-8859-15
+ ISO885916 MIB = 112 // ISO-8859-16
+ GBK MIB = 113 // GBK
+ GB18030 MIB = 114 // GB18030
+ OSDEBCDICDF0415 MIB = 115 // OSD_EBCDIC_DF04_15
+ OSDEBCDICDF03IRV MIB = 116 // OSD_EBCDIC_DF03_IRV
+ OSDEBCDICDF041 MIB = 117 // OSD_EBCDIC_DF04_1
+ ISO115481 MIB = 118 // ISO-11548-1
+ KZ1048 MIB = 119 // KZ-1048
+ Unicode MIB = 1000 // ISO-10646-UCS-2
+ UCS4 MIB = 1001 // ISO-10646-UCS-4
+ UnicodeASCII MIB = 1002 // ISO-10646-UCS-Basic
+ UnicodeLatin1 MIB = 1003 // ISO-10646-Unicode-Latin1
+ UnicodeJapanese MIB = 1004 // ISO-10646-J-1
+ UnicodeIBM1261 MIB = 1005 // ISO-Unicode-IBM-1261
+ UnicodeIBM1268 MIB = 1006 // ISO-Unicode-IBM-1268
+ UnicodeIBM1276 MIB = 1007 // ISO-Unicode-IBM-1276
+ UnicodeIBM1264 MIB = 1008 // ISO-Unicode-IBM-1264
+ UnicodeIBM1265 MIB = 1009 // ISO-Unicode-IBM-1265
+ Unicode11 MIB = 1010 // UNICODE-1-1
+ SCSU MIB = 1011 // SCSU
+ UTF7 MIB = 1012 // UTF-7
+ UTF16BE MIB = 1013 // UTF-16BE
+ UTF16LE MIB = 1014 // UTF-16LE
+ UTF16 MIB = 1015 // UTF-16
+ CESU8 MIB = 1016 // CESU-8
+ UTF32 MIB = 1017 // UTF-32
+ UTF32BE MIB = 1018 // UTF-32BE
+ UTF32LE MIB = 1019 // UTF-32LE
+ BOCU1 MIB = 1020 // BOCU-1
+ Windows30Latin1 MIB = 2000 // ISO-8859-1-Windows-3.0-Latin-1
+ Windows31Latin1 MIB = 2001 // ISO-8859-1-Windows-3.1-Latin-1
+ Windows31Latin2 MIB = 2002 // ISO-8859-2-Windows-Latin-2
+ Windows31Latin5 MIB = 2003 // ISO-8859-9-Windows-Latin-5
+ HPRoman8 MIB = 2004 // hp-roman8
+ AdobeStandardEncoding MIB = 2005 // Adobe-Standard-Encoding
+ VenturaUS MIB = 2006 // Ventura-US
+ VenturaInternational MIB = 2007 // Ventura-International
+ DECMCS MIB = 2008 // DEC-MCS
+ PC850Multilingual MIB = 2009 // IBM850
+ PC8DanishNorwegian MIB = 2012 // PC8-Danish-Norwegian
+ PC862LatinHebrew MIB = 2013 // IBM862
+ PC8Turkish MIB = 2014 // PC8-Turkish
+ IBMSymbols MIB = 2015 // IBM-Symbols
+ IBMThai MIB = 2016 // IBM-Thai
+ HPLegal MIB = 2017 // HP-Legal
+ HPPiFont MIB = 2018 // HP-Pi-font
+ HPMath8 MIB = 2019 // HP-Math8
+ HPPSMath MIB = 2020 // Adobe-Symbol-Encoding
+ HPDesktop MIB = 2021 // HP-DeskTop
+ VenturaMath MIB = 2022 // Ventura-Math
+ MicrosoftPublishing MIB = 2023 // Microsoft-Publishing
+ Windows31J MIB = 2024 // Windows-31J
+ GB2312 MIB = 2025 // GB2312 (mime: GB2312)
+ Big5 MIB = 2026 // Big5 (mime: Big5)
+ Macintosh MIB = 2027 // macintosh
+ IBM037 MIB = 2028 // IBM037
+ IBM038 MIB = 2029 // IBM038
+ IBM273 MIB = 2030 // IBM273
+ IBM274 MIB = 2031 // IBM274
+ IBM275 MIB = 2032 // IBM275
+ IBM277 MIB = 2033 // IBM277
+ IBM278 MIB = 2034 // IBM278
+ IBM280 MIB = 2035 // IBM280
+ IBM281 MIB = 2036 // IBM281
+ IBM284 MIB = 2037 // IBM284
+ IBM285 MIB = 2038 // IBM285
+ IBM290 MIB = 2039 // IBM290
+ IBM297 MIB = 2040 // IBM297
+ IBM420 MIB = 2041 // IBM420
+ IBM423 MIB = 2042 // IBM423
+ IBM424 MIB = 2043 // IBM424
+ PC8CodePage437 MIB = 2011 // IBM437
+ IBM500 MIB = 2044 // IBM500
+ IBM851 MIB = 2045 // IBM851
+ PCp852 MIB = 2010 // IBM852
+ IBM855 MIB = 2046 // IBM855
+ IBM857 MIB = 2047 // IBM857
+ IBM860 MIB = 2048 // IBM860
+ IBM861 MIB = 2049 // IBM861
+ IBM863 MIB = 2050 // IBM863
+ IBM864 MIB = 2051 // IBM864
+ IBM865 MIB = 2052 // IBM865
+ IBM868 MIB = 2053 // IBM868
+ IBM869 MIB = 2054 // IBM869
+ IBM870 MIB = 2055 // IBM870
+ IBM871 MIB = 2056 // IBM871
+ IBM880 MIB = 2057 // IBM880
+ IBM891 MIB = 2058 // IBM891
+ IBM903 MIB = 2059 // IBM903
+ IBBM904 MIB = 2060 // IBM904
+ IBM905 MIB = 2061 // IBM905
+ IBM918 MIB = 2062 // IBM918
+ IBM1026 MIB = 2063 // IBM1026
+ IBMEBCDICATDE MIB = 2064 // EBCDIC-AT-DE
+ EBCDICATDEA MIB = 2065 // EBCDIC-AT-DE-A
+ EBCDICCAFR MIB = 2066 // EBCDIC-CA-FR
+ EBCDICDKNO MIB = 2067 // EBCDIC-DK-NO
+ EBCDICDKNOA MIB = 2068 // EBCDIC-DK-NO-A
+ EBCDICFISE MIB = 2069 // EBCDIC-FI-SE
+ EBCDICFISEA MIB = 2070 // EBCDIC-FI-SE-A
+ EBCDICFR MIB = 2071 // EBCDIC-FR
+ EBCDICIT MIB = 2072 // EBCDIC-IT
+ EBCDICPT MIB = 2073 // EBCDIC-PT
+ EBCDICES MIB = 2074 // EBCDIC-ES
+ EBCDICESA MIB = 2075 // EBCDIC-ES-A
+ EBCDICESS MIB = 2076 // EBCDIC-ES-S
+ EBCDICUK MIB = 2077 // EBCDIC-UK
+ EBCDICUS MIB = 2078 // EBCDIC-US
+ Unknown8BiT MIB = 2079 // UNKNOWN-8BIT
+ Mnemonic MIB = 2080 // MNEMONIC
+ Mnem MIB = 2081 // MNEM
+ VISCII MIB = 2082 // VISCII
+ VIQR MIB = 2083 // VIQR
+ KOI8R MIB = 2084 // KOI8-R (mime: KOI8-R)
+ HZGB2312 MIB = 2085 // HZ-GB-2312
+ IBM866 MIB = 2086 // IBM866
+ PC775Baltic MIB = 2087 // IBM775
+ KOI8U MIB = 2088 // KOI8-U
+ IBM00858 MIB = 2089 // IBM00858
+ IBM00924 MIB = 2090 // IBM00924
+ IBM01140 MIB = 2091 // IBM01140
+ IBM01141 MIB = 2092 // IBM01141
+ IBM01142 MIB = 2093 // IBM01142
+ IBM01143 MIB = 2094 // IBM01143
+ IBM01144 MIB = 2095 // IBM01144
+ IBM01145 MIB = 2096 // IBM01145
+ IBM01146 MIB = 2097 // IBM01146
+ IBM01147 MIB = 2098 // IBM01147
+ IBM01148 MIB = 2099 // IBM01148
+ IBM01149 MIB = 2100 // IBM01149
+ Big5HKSCS MIB = 2101 // Big5-HKSCS
+ IBM1047 MIB = 2102 // IBM1047
+ PTCP154 MIB = 2103 // PTCP154
+ Amiga1251 MIB = 2104 // Amiga-1251
+ KOI7switched MIB = 2105 // KOI7-switched
+ BRF MIB = 2106 // BRF
+ TSCII MIB = 2107 // TSCII
+ CP51932 MIB = 2108 // CP51932
+ Windows874 MIB = 2109 // windows-874
+ Windows1250 MIB = 2250 // windows-1250
+ Windows1251 MIB = 2251 // windows-1251
+ Windows1252 MIB = 2252 // windows-1252
+ Windows1253 MIB = 2253 // windows-1253
+ Windows1254 MIB = 2254 // windows-1254
+ Windows1255 MIB = 2255 // windows-1255
+ Windows1256 MIB = 2256 // windows-1256
+ Windows1257 MIB = 2257 // windows-1257
+ Windows1258 MIB = 2258 // windows-1258
+ TIS620 MIB = 2259 // TIS-620
+ CP50220 MIB = 2260 // CP50220
+)
diff --git a/encoding/registry/registry.go b/encoding/registry/registry.go
new file mode 100644
index 0000000..b0a671b
--- /dev/null
+++ b/encoding/registry/registry.go
@@ -0,0 +1,203 @@
+// Copyright 2013 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+//go:generate go run gen.go
+
+// Package registry provides a standardized mechanism for Encodings to report
+// the type of encoding they support to an arbitrary implementation of a
+// character set mapping.
+//
+// This package connects mappings with encodings. If you are neither writing a
+// Mapper nor an Encoding, you probably will be more interested in Mapper
+// implementations like the ones defined in the packages htmlmap and mimemap.
+package registry
+
+// Design Notes
+//
+// There seem to be different standards for mapping labels to encodings, each
+// tailored to specific applications. Adhering to any one of these standards
+// seems bound to lead to suboptimal results for some applications. For this
+// reason, we assume it is better to have different implementations for
+// different purposes.
+// A Mapper interface is defined in the encoding package to capture this.
+//
+// There are two parts to the mapping problem: 1) defining sets of encodings and
+// 2) defining how to map a larger set of labels to these sets. This package
+// alleviates the first issue. The vast majority of encodings can be labeled by
+// one of the MIB codes in the IANA character set registry (see
+// http://www.iana.org/assignments/character-sets/character-sets.xhtml). The
+// Mapper interface handles the second.
+//
+// A registration mechanism isn't really needed when defining a small mapping to
+// a very limited set of encodings (such as would be necessary for HTML 5
+// support). Things get murkier when mapping to a larger set, where encodings
+// could be implemented by third parties. In such a scenario, Mapper
+// implementers would need to know about all encodings they would want to use
+// and/or encoder implementers would need to know about all mappings they would
+// want to be added to. The registration system standardizes on a common set of
+// identifiers and simplifies discovery.
+//
+// Some constraints that lead to this design:
+// - Allow different mapping implementations.
+// - Don't use init(): the order of init evaluations may lead to unexpected
+// results. Also, using init may force tables to be linked and may
constrain
+// init-time initialization of mappers.
+// - Users should typically not need to know about IANA MIB enums, but only
+// be concerned with domain-specific labels as handled by the Mappers.
+// - In the event a user does not use mappers, the overhead of the
registration
+// mechanism should be minimal. It should also not force tables to be
loaded
+// unnecessarily.
+// - Allow a subset of encodings to be selected.
+// So in the ultimate design, encodings still need to be added
programmatically
+// to a Mapper. It allows for this to be done at the package level,
however,
+// grouping multiple encodings together.
+//
+// Note that not all character sets are covered by this approach. For this
+// purpose we define a separate registry for special codes. We keep both
+// mechanisms for various reasons:
+// - It provides stricter standardization for the common case.
+// - It allows users to more easily distinguish between a common encoding
or a
+// specialized one.
+// - Handling enums is more efficient and causes less overhead.
+// Note that the use of the enums is largely hidden from the user. Mappers
do
+// not expose them in the API.
+//
+// We also did not opt for the more elaborate and extensive classification
+// defined in Unicode TR #22 (
http://www.unicode.org/reports/tr22/). This
report
+// argues that the standard way of character set mapping is not
sufficiently
+// precise. We think the added precision (and complexity) is not
necessary at
+// this point:
+// - Mappers are mostly used for mapping standard identifiers, in which
case
+// the label space is already lacking precision.
+// - In Go, character sets are accessible directly from the API; to get
more
+// precision one can simply bypass the registry system.
+// - The registry system has been designed to allow adding other options
and
+// registration methods for better precision (e.g. versions, vendors,
etc.)
+// at a later point.
+// - The mechanism proposed in TR #22 is not widely used, which suggests
that
+// the simpler mechanism is sufficient, especially in light of the
previous
+// points.
+// Either way, TR #22 gives useful insights in handling encodings and
should be
+// heeded.
+//
+// References:
+// -
http://www.iana.org/assignments/character-sets/character-sets.xhtml
+// -
http://www.w3.org/TR/encoding/
+// -
http://www.unicode.org/reports/tr22/
+
+import (
+ "errors"
+
+ "
golang.org/x/text/encoding"
+)
+
+// MIB is an IANA character set module identifier plus a constant for some
+// common identifiers not covered by IANA.
+//
+// See http://www.iana.org/assignments/ianacharset-mib.
+type MIB uint16
+
+// These additional MIB types are not standard, but are added because they are
+// common, but not defined in IANA. Their values start at 10000, above the
+// IANA-assigned codes listed in mib.go.
+const (
+ // XUserDefined is the code for x-user-defined.
+ XUserDefined MIB = 10000 + iota
+
+ // MacintoshCyrillic is the code for x-mac-cyrillic.
+ MacintoshCyrillic
+)
+
+// ErrNoTypeInfo is an error that is returned if an attempt was made to get the
+// type of an Encoding that does not support the Registerer interface.
+var ErrNoTypeInfo = errors.New("encoding: encoding does not implement Registerer interface")
+
+// Registerer can be implemented by Encodings to indicate the encodings it
+// supports.
+type Registerer interface {
+ // Register is called with a Registrar on which the implementation is
+ // expected to report its encoding type(s). A non-nil error signals that
+ // registration failed.
+ Register(r Registrar) error
+}
+
+// A Registrar records the type of an Encoding. It can be used to create
+// character set mappings.
+type Registrar interface {
+ // Register is the preferred method of associating an Encoding with a type.
+ // A Registrar may return an error if it does not support encodings of the
+ // given type.
+ Register(code MIB, e encoding.Encoding) error
+
+ // RegisterOther is mostly intended to register encodings that cannot be
+ // encoded within the IANA MIB scheme. A Registrar may return an error if
+ // the label is not recognized or not supported. The label must be
+ // canonical as defined by the corresponding Mapper implementation.
+ RegisterOther(label string, e encoding.Encoding) error
+}
+
+// Register adds the given encodings to the given registrar. The encodings must
+// implement the Registerer interface. It will attempt to register all encodings
+// even if some cause an error when registering.
+//
+// The returned error is the first one encountered, or nil if every encoding
+// registered successfully. An encoding that does not implement Registerer
+// contributes ErrNoTypeInfo.
+func Register(r Registrar, enc ...encoding.Encoding) error {
+	var first error
+	for _, e := range enc {
+		err := ErrNoTypeInfo
+		if rr, ok := e.(Registerer); ok {
+			err = rr.Register(r)
+		}
+		// Keep going after a failure, but remember only the first error so
+		// that a later one cannot mask it.
+		if err != nil && first == nil {
+			first = err
+		}
+	}
+	return first
+}
+
+// Type returns the first-reported type of e. It returns ErrNoTypeInfo if e does
+// not implement Registerer or does not report a type.
+func Type(e encoding.Encoding) (Info, error) {
+	rr, ok := e.(Registerer)
+	if !ok {
+		return Info{}, ErrNoTypeInfo
+	}
+	var g getFirst
+	// The error is deliberately ignored: getFirst itself never fails, and
+	// whether a type was reported is decided by g.have below.
+	_ = rr.Register(&g)
+	if !g.have {
+		return Info{}, ErrNoTypeInfo
+	}
+	return g.info, nil
+}
+
+// Info encapsulates type information of an encoding.
+type Info struct {
+ mib MIB // code recorded by getFirst.Register; returned by Code
+ other string // label recorded by getFirst.RegisterOther; returned by Label
+}
+
+// Code returns the MIB code of the encoding. If it returns Other, the Label
+// method will return the alternative key used for the registration.
+func (i *Info) Code() MIB {
+ return i.mib
+}
+
+// Label returns the label used for the encoding in case Code returns Other.
+func (i *Info) Label() string {
+ return i.other
+}
+
+// getFirst is a registrar that simply records the first registered type.
+type getFirst struct {
+ have bool // true once a type has been recorded; later registrations are ignored
+ info Info // the recorded type; only meaningful when have is true
+}
+
+// Register stores code as the encoding's type unless an earlier registration
+// has already been recorded. It always returns nil.
+func (g *getFirst) Register(code MIB, e encoding.Encoding) error {
+	if g.have {
+		return nil
+	}
+	g.have = true
+	g.info.mib = code
+	return nil
+}
+
+// RegisterOther stores label as the encoding's type unless an earlier
+// registration has already been recorded. The code is set to Other so that
+// Info.Code signals that the type is identified by Info.Label rather than by
+// an IANA MIB code. It always returns nil.
+func (g *getFirst) RegisterOther(label string, e encoding.Encoding) error {
+	if !g.have {
+		g.info.mib = Other
+		g.info.other = label
+		g.have = true
+	}
+	return nil
+}
--
https://go-review.googlesource.com/7676
Gerrit-Reviewer: Nigel Tao <
nige...@golang.org>