idnits 2.17.1 draft-ietf-idn-step-01.txt: -(264): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(286): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(289): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(300): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(319): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(375): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(422): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(453): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(482): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(535): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(548): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(553): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(554): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(558): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(561): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(562): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(595): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(596): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(748): Line appears to be too long, but this could be caused by non-ascii characters 
in UTF-8 encoding -(761): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(762): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(764): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(767): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(863): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(864): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(872): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(883): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(886): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(887): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(944): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(964): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1076): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1085): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1211): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1212): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1213): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1217): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1253): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 
encoding -(1288): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1290): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1314): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1388): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1581): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1582): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1690): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1732): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1738): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1817): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1826): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1865): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding -(1871): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding Checking boilerplate required by RFC 5378 and the IETF Trust (see https://trustee.ietf.org/license-info): ---------------------------------------------------------------------------- ** Looks like you're using RFC 2026 boilerplate. This must be updated to follow RFC 3978/3979, as updated by RFC 4748. Checking nits according to https://www.ietf.org/id-info/1id-guidelines.txt: ---------------------------------------------------------------------------- ** The document seems to lack a 1id_guidelines paragraph about 6 months document validity -- however, there's a paragraph with a matching beginning. Boilerplate error? 
** The document is more than 15 pages and seems to lack a Table of Contents. == There are 113 instances of lines with non-ascii characters in the document. == No 'Intended status' indicated for this document; assuming Proposed Standard == The page length should not exceed 58 lines per page, but there was 1 longer page, the longest (page 1) being 2188 lines Checking nits according to https://www.ietf.org/id-info/checklist : ---------------------------------------------------------------------------- ** The document seems to lack separate sections for Informative/Normative References. All references will be assumed normative when checking for downward references. ** There are 68 instances of too long lines in the document, the longest one being 6 characters in excess of 72. ** There are 456 instances of lines with control characters in the document. ** The abstract seems to contain references ([ISO639]), which it shouldn't. Please replace those with straight textual mentions of the documents in question. Miscellaneous warnings: ---------------------------------------------------------------------------- == Line 641 has weird spacing: '...Chinese zh-...' == Line 643 has weird spacing: '...Deutsch de-...' == Line 644 has weird spacing: '...English en-...' == Line 645 has weird spacing: '...peranto eo-...' == Line 649 has weird spacing: '...apanese ja-...' == (14 more instances...) == The document seems to lack the recommended RFC 2119 boilerplate, even if it appears to use RFC 2119 keywords. (The document does seem to have the reference to RFC 2119 which the ID-Checklist requires). -- The document seems to lack a disclaimer for pre-RFC5378 work, but may have content which was first submitted before 10 November 2008. If you have contacted all the original authors and they are all willing to grant the BCP78 rights to the IETF Trust, then this is fine, and you can ignore this comment. If not, you may need to add the pre-RFC5378 disclaimer. 
(See the Legal Provisions document at https://trustee.ietf.org/license-info for more information.) -- Couldn't find a document date in the document -- date freshness check skipped. -- Found something which looks like a code comment -- if you have code sections in the document, please surround them with '' and '' lines. Checking references for intended status: Proposed Standard ---------------------------------------------------------------------------- (See RFCs 3967 and 4897 for information about using normative references to lower-maturity documents in RFCs) == Missing Reference: 'ISO 639' is mentioned on line 36, but not defined == Missing Reference: 'UCS' is mentioned on line 247, but not defined == Missing Reference: 'Uname' is mentioned on line 268, but not defined == Missing Reference: 'IDNmap' is mentioned on line 1724, but not defined == Missing Reference: 'Unicode3' is mentioned on line 361, but not defined == Missing Reference: 'Nameprep' is mentioned on line 1088, but not defined == Missing Reference: 'Stone' is mentioned on line 1263, but not defined == Missing Reference: 'A-Za-z0-9' is mentioned on line 463, but not defined == Missing Reference: 'T' is mentioned on line 491, but not defined == Missing Reference: 'P1' is mentioned on line 491, but not defined == Missing Reference: 'L1' is mentioned on line 491, but not defined == Missing Reference: 'P2' is mentioned on line 491, but not defined == Missing Reference: 'L2' is mentioned on line 491, but not defined == Missing Reference: 'Py' is mentioned on line 491, but not defined -- Looks like a reference, but probably isn't: '0' on line 521 == Missing Reference: 'SLS' is mentioned on line 1250, but not defined == Missing Reference: 'RFC 2825' is mentioned on line 1266, but not defined == Missing Reference: 'R1' is mentioned on line 1404, but not defined == Missing Reference: 'R2' is mentioned on line 1433, but not defined == Missing Reference: 'R3' is mentioned on line 1558, but not defined == Missing 
Reference: 'R1-R3' is mentioned on line 1503, but not defined == Missing Reference: 'R4' is mentioned on line 1508, but not defined == Missing Reference: 'R1-R3-R4' is mentioned on line 1559, but not defined == Missing Reference: 'Rx-Ry-Rz-R4' is mentioned on line 1563, but not defined == Missing Reference: 'Appendix' is mentioned on line 1901, but not defined == Missing Reference: 'UAX 15' is mentioned on line 1665, but not defined == Missing Reference: 'Bidi' is mentioned on line 1665, but not defined == Missing Reference: 'MAXdatalen' is mentioned on line 2137, but not defined == Missing Reference: 'FILENAMSIZ' is mentioned on line 1979, but not defined == Unused Reference: 'ASCII' is defined on line 1781, but no explicit reference was found in the text == Unused Reference: 'Dictionary79' is defined on line 1791, but no explicit reference was found in the text == Unused Reference: 'IDNReq' is defined on line 1797, but no explicit reference was found in the text == Unused Reference: 'ISO639' is defined on line 1803, but no explicit reference was found in the text == Unused Reference: 'PinyinCon' is defined on line 1817, but no explicit reference was found in the text == Unused Reference: 'Macmillan93' is defined on line 1821, but no explicit reference was found in the text == Unused Reference: 'RFC 2026' is defined on line 1826, but no explicit reference was found in the text == Unused Reference: 'RFC2277' is defined on line 1832, but no explicit reference was found in the text == Unused Reference: 'RFC2396' is defined on line 1835, but no explicit reference was found in the text == Unused Reference: 'SIS' is defined on line 1841, but no explicit reference was found in the text == Unused Reference: 'RFC2825' is defined on line 1847, but no explicit reference was found in the text == Unused Reference: 'UAX15' is defined on line 1850, but no explicit reference was found in the text == Unused Reference: 'UNICODE' is defined on line 1854, but no explicit reference 
was found in the text == Unused Reference: 'UNICODE30' is defined on line 1857, but no explicit reference was found in the text == Unused Reference: 'Versions' is defined on line 1865, but no explicit reference was found in the text -- Possible downref: Non-RFC (?) normative reference: ref. 'ASCII' -- Possible downref: Normative reference to a draft: ref. 'CJK' -- Possible downref: Non-RFC (?) normative reference: ref. 'DeFrancis 1989' -- Possible downref: Non-RFC (?) normative reference: ref. 'Dictionary79' -- No information found for draft-ietf-idn-icdn - is the name correct? -- Possible downref: Normative reference to a draft: ref. 'Icdn' -- Possible downref: Normative reference to a draft: ref. 'IDNReq' -- Possible downref: Non-RFC (?) normative reference: ref. 'IPA' -- Possible downref: Non-RFC (?) normative reference: ref. 'ISO639' -- Possible downref: Non-RFC (?) normative reference: ref. 'ISO10646' -- Possible downref: Non-RFC (?) normative reference: ref. 'Hindi 98' -- Possible downref: Non-RFC (?) normative reference: ref. 'Translit 97' -- Possible downref: Non-RFC (?) normative reference: ref. 'PinyinCon' -- Possible downref: Non-RFC (?) normative reference: ref. 'Macmillan93' -- Possible downref: Non-RFC (?) normative reference: ref. 'Mnemonics' ** Obsolete normative reference: RFC 2396 (Obsoleted by RFC 3986) -- Possible downref: Non-RFC (?) normative reference: ref. 'Russian 44' == Outdated reference: A later version (-02) exists of draft-mealling-sls-00 -- Possible downref: Normative reference to a draft: ref. 'SIS' ** Downref: Normative reference to an Informational RFC: RFC 2825 -- Possible downref: Non-RFC (?) normative reference: ref. 'UAX15' -- Possible downref: Non-RFC (?) normative reference: ref. 'UNICODE' -- Possible downref: Non-RFC (?) normative reference: ref. 'UNICODE30' -- Duplicate reference: RFC2396, mentioned in 'URI', was also mentioned in 'RFC2396'. ** Obsolete normative reference: RFC 2396 (ref. 
'URI') (Obsoleted by RFC 3986) -- Possible downref: Normative reference to a draft: ref. 'Versions' -- Possible downref: Non-RFC (?) normative reference: ref. 'WIPO' -- Possible downref: Non-RFC (?) normative reference: ref. 'WORLD 95' -- Possible downref: Non-RFC (?) normative reference: ref. 'Ye95' Summary: 10 errors (**), 0 flaws (~~), 54 warnings (==), 29 comments (--). Run idnits with the --verbose option for more detailed information about the items above. -------------------------------------------------------------------------------- 1 Internet Draft Liana Ye 2 draft-ietf-idn-step-01.txt Y&D ISG 3 Sept. 28, 2001 4 Obsoletes: draft-ietf-idn-step-01.txt 5 Expires in six months (March 2002) 7 StepCode - A Mnemonic Internationalized Domain Name Encoding 9 Status of this memo 11 This document is an Internet-Draft and is in full conformance with 12 all provisions of Section 10 of RFC2026. 14 Internet-Drafts are working documents of the Internet Engineering 15 Task Force (IETF), its areas, and its working groups. Note that 16 other groups may also distribute working documents as 17 Internet-Drafts. 19 Internet-Drafts are draft documents valid for a maximum of six 20 months and may be updated, replaced, or obsolete by other documents 21 at any time. It is inappropriate to use Internet-Drafts as reference 22 material or to cite them other than as "work in progress." 24 The list of current Internet-Drafts can be accessed at 25 http://www.ietf.org/ietf/1id-abstracts.txt 27 The list of Internet-Draft Shadow Directories can be accessed 28 at http://www.ietf.org/shadow.html. 
30 Abstract 32 This document describes an Internationalized Domain Name (IDN) 33 encoding method that uses US-ASCII [a-z0-9] characters and preserves the 34 primary sound value of the names users want. It is a technically 35 feasible, though linguistically demanding, single mechanism to represent 36 names in multiple scripts, with language tags defined by [ISO 639], in 37 the form DNS requires, such that the encoded names can be used as 38 valid domain name identifiers. 40 Table of Contents 41 1. Introduction 42 1.1 Context 43 1.2 Issues 44 1.3 Romanized Multi-language Representation 45 1.4 StepCode Protocol to Represent Trade Names 46 1.5 StepCode Features 47 1.6 Disclaimer 48 1.7 Terminology 49 1.8 IDN summary 50 2. Host Name Transformation 51 2.1 Syntax of StepCode 52 2.2 Glyph Boundary Marks 53 2.3 Encoding Steps 54 2.4 Transliteration Schemes 55 2.5 Alphabetic Script Transformation - Mechanical Methods 56 2.6 Consonant Script Transformation - Developmental Issues 57 2.7 Character Script Transformation - Feasibility 58 2.8 Mixed Script Transformation - Implementing Japanese 59 3. Numerical Symbol Value Assignment 60 3.1 Diacritic Marks 61 3.2 Phoneme Table 62 3.3 Overflowing 63 3.4 Priority List 64 3.5 Radical Layout Indicators 65 4. Language Specific Procedures 66 4.1 IDN Input Normalization Procedures 67 4.2 DNS Fitting Procedures 68 5. Embodiment of StepCode Protocol 69 ... 71 Tables: 72 Table 1. Romanized Latin Letter Assignments 73 Table 2. Top four non-native languages used in the world 74 Table 3. Russian Transliteration Table 75 Table 4. Two methods to expand the Latin script 76 Table 5. IDN Hindi Section Map 77 Table 6. General Diacritics Mapping Table 78 Table 7. Example of Using Diacritics mapping (French) 79 Table 8. Example Phoneme Mapping (Subset of IPA) 80 Table 9. Example use of Overflowing mapping (Chinese) 81 Table 10. Example use of Priority Mapping (English) 82 Table 11. Glyph Layout Numeral Values 84 1. 
Introduction 86 Symbolic representation of a concept takes on many forms. It can be 87 encrypted to conceal it from a human reader, it can be compressed for a 88 mechanical program reader, and it can be an icon for readers of any spoken 89 language. Because a domain name represents an entity such as an 90 individual, a product, or an organization, it has to be readable by 91 human readers in their native languages as well as by human 92 readers who do not know those languages, in addition to a computer 93 program reader which only reads code points. To bridge the three types 94 of requirement, StepCode is proposed to transform a native symbol to 95 one or more universal ASCII symbols in a mechanical manner for a 96 mechanical program reader. 98 1.1 Context 100 Although the world-wide desire to use characters other than plain ASCII 101 in hostnames is bubbling up and accelerating, ICANN has to take 102 a cautious approach to adopting an international domain name system, 103 for fear of duplicated or confusing new domain names. The challenge 104 of how to represent the names users want in the DNS in a way that is 105 clear, technically feasible, and unique is still an open issue. 107 1.2. Issues in Multilingual Representation of DNS Host Names 109 A basic technical issue regarding a name is sorting and searching 110 zone files or name servers of hostname identifiers containing 111 different written languages for potentially very large numbers of 112 users online, say 10% of the world's population. Hostname 113 identification could become a bottleneck for internet traffic if 114 sorting and searching have to be treated 1) in more than one set of 115 partially overlapping or mixed or possibly mixed symbolic 116 representations; and 2) mostly in compressed or semantically random 117 ordered zone files scattered around the globe, as in the Shared 118 Registration System ("SRS") since its installation in 1999. 
120 Historically, character-formed scripts such as CJK characters have 121 inherent sorting and indexing difficulties, and it used to be an 122 intellectual activity just to use a dictionary. In fact, this has been 123 a primary problem in computer processing of Oriental languages since 124 the early development of the computer industry. After persistent 125 research and development over the past decades, the solutions are all 126 based on some type of table search, the nature of such 127 processing is well understood, and the techniques are ready to 128 be applied to a very large character set, such as the Universal Character 129 Set [UCS]. 131 Given the experience we have obtained from Oriental language 132 processing, suppose that we have solved such an indexing problem 133 and have accommodated mixed scripts such as Japanese and Korean, and 134 that IDN goes to a character-form based system; it is then foreseeable 135 that the IDN system will have to support a text-based DNS system as 136 well for a long time. After all, the DNS system is a historically 137 successful system. To throw such a system away is like asking 138 people to stop shopping at supermarkets and pick up their lettuce 139 on the Internet. It is therefore certain that we will have to deal with 140 two sets of domain name identifiers for a long time ahead. 142 The Romanized Pinyin, Jamo and On-kun systems for CJK character 143 indexing have provided a feasible but partial solution. The currently 144 used complete solution is to go through a software process of both 145 searching tables for possible matches (not exact-match DNS lookups) 146 and, where necessary, dialogue with the users, to arrive at strong 147 candidates for the glyph representation. If this character selection 148 process is organized in a way similar to the book indexing system, 149 alphanumeral-digits-digits..., used in a North American library, then 150 the indices can be codified using the Latin alphabet. 
The dream of a 151 complete Romanized character system will become reality, sorting and 152 searching international domain names with one set of symbolic 153 representations will be speedy, and exactly matched DNS lookups could 154 be a reality. 156 1.3. Romanized Multi-language Representation 158 Codifying a trade name representation process is not limited to 159 codifying a particular ASCII Compatible Encoding method or a particular 160 code mapping from one code standard to another code standard in a 161 technical context. It shall codify one set of symbols, or one 162 representation system, and a number of efficient paths to let the users 163 have some freedom to decide how to use the system to express their own 164 trade names in the Internet context. Though this was the spirit 165 of the ASCII standard, it is time to set more specific paths on how 166 to use ASCII to represent different scripts of spoken languages, or to 167 codify such a representation process, so that the number of paths does 168 not head for combinatorial explosion, as is the case in Chinese 169 character encoding methods and for Japanese input systems. This is 170 analogous to letting students tread out an optimal path on campus before a 171 concrete walk is poured, and it is now time to codify the paths. 173 The representation system for trade names is due to be unified. In fact, 174 writing system unification has been seen with Arabic, Latin and 175 Chinese. Many different spoken language groups use each of them. 176 According to [DeFrancis 1989], human scripts can be organized into 177 three groups by their phonetic characteristics: 178 1. Syllabic systems, for example, Chinese, Japanese, Maya and Yi; 179 2. Consonantal systems, such as Hebrew, Arabic and Indian languages; 180 and 3. Alphabetic systems, including Greek, Latin, Cyrillic, 181 Korean Hangul and English. 
Alphabetic systems can be unified by 182 embedding some differences under the hat of mnemonic representation 183 of language symbols, so that the French 'u' is permitted to have a 184 different sound value from the English 'u'. 186 Mapping a consonantal system to an alphabet symbol set is essentially 187 embedding some phonetic differences, using a Latin mnemonic hat. 188 Additionally, there is the question of how to represent the vowels 189 of the language. Turkey has provided an answer to this question, and 190 the Library of Congress has implemented an extensive set of languages using 191 the same principle [Translit 97]. 193 As to unifying a syllabic system with an alphabet system, two issues 194 need to be addressed. The first is the inclusion of additional 195 character information which cannot be expressed with a one-layer, 196 flat alphabet system. The second issue is the reversibility 197 from the alphabetic system back to the syllabic system. 199 1.4. StepCode Protocol to Represent Trade Names 201 The proposed solution is called StepCode, for its staircase type 202 architecture in a transliteration procedure. First, it specifies the 203 phonetic differences to be embedded in the representation, where an 204 International Phonetic Alphabet [IPA] description of the embedded 205 differences shall be recorded. Second, if the Romanized embedding 206 is not sufficient to cover the differences, such as tones, 207 suprasegmentals and diacritics, then it extends the mapping space to a 208 26x10 table for secondary phonetic elements which cannot be embedded 209 under the Latin mnemonic hat. Third, if the 26x10 space is not 210 sufficient, then it linearizes the symbol by specifying each of its 211 components. This last part may become recursive, or go down for 212 more steps. 
214 This open-ended procedure not only provides a path to unify a large 215 syllabic or character system with an alphabet symbol set, but also 216 ensures that more semantically specific symbols, such as trademarks 217 and logos, can be represented online and sorted for speedy referencing. 218 In addition, the solution tolerates different viewpoints of the same 219 glyph, such that a CJK character may be accessed by Mandarin Pinyin, 220 Cantonese Wade, Japanese On-kun, or Korean Hangul, and also by users of 221 the same dialect who create different expressions in viewing the same 222 glyph. 224 The StepCode protocol does not open the door to trade name chaos. First, 225 there are finitely many different scripts to support particular 226 dialects and expressions. Second, the protocol provides locally 227 available expressions for users to choose from, which also helps in 228 making expressions conform, especially in the IDN context. Third, although 229 the process allows users of the same dialect to create different 230 expressions in viewing a glyph, as has been experienced with 231 over 600 varieties of Chinese character encoding schemes in the past 232 three decades, it limits the different views of a glyph to a matrix 233 of one to ten cells on one fixed starting point [Ye95], where 234 variations in such a process become predictable and manageable. 236 Due to its step nature, the representation can (and should) stop 237 for each symbol, as soon as the symbol can be identified within 238 its designated context. For example, consider the following list of StepCodes 239 for four Chinese characters: 241 xin1qin1jin0 242 zhu2ge1ge0 243 qing1shui1qing0 244 hua2hua2shi0 246 Each of these codes uniquely identifies a CJK [CJK] character at a UCS 247 [UCS] code point in the CJK section using Pinyin spelling. They all have 248 three parts: the first part is the Pinyin spelling of the character; the 249 second part is the digit following the Pinyin. 
The digit indicates the 250 end of a character spelling and its tone mark. The two parts together 251 are the transliteration of a character. The remaining alphanumeral 252 string following the first digit is the third part of StepCode. It 253 is in the same format as the character transliteration, and is the radical 254 part of the transliteration. 256 When a registration calls for the four characters, the four 257 characters may be combined into one new alphanumeral string: 258 "xinzhuqinghua1212qin1jin0ge1ge0shui1qing0hua2shi0". 259 The list of StepCodes for the above four characters results from 260 two complete iterations of the StepCode protocol. 262 Since it is enough for "xinzhuqinghua" to identify a well-known name 263 in the DNS system, "xinzhuqinghua1212" for a not well-known name, and 264 "xinzhuqinghua1212qin1jin0ge1ge0" for pin-pointing a rarely known name, 265 it is up to the registrant and the zone manager to register a DNS 266 identifier of just the right length for the user, and to keep the full 267 record for the code reversal process, depending on the IETF and ICANN decision to 268 support a dual-record system [Uname][IDNmap]. 270 1.5 StepCode Features 272 The StepCode protocol is fully compatible with the DNS specification, yet 273 is mnemonic, provides friendly multi-language access to code points, and 274 accommodates mixed script use. 276 1.5.1 Multi-language access of the same UCS code point 278 The approach is similar to the method used for searching books in a library, such that 279 CJK characters may be accessed by different language users. For 280 example, the following four Korean characters may be coded as: 282 U+???? sim0sim0 Hanja 283 U+2fa5 ni0ni0 Hanja 284 U+351a to0t2o0 Hangul 285 U+3747 mot0m2o2t0 Hangul 286 (Note 1: the transliteration is that used in [Translit 97], where the "t" 287 in "mot" should be consistent with a jamo for a Korean sound value. 288 Note 2: a hangul may not need to be treated as a CJK character. 
If 289 that is the case, then "to" and "mot" MUST be unique within all hangul 290 symbols.) 292 The two Hanja characters are CJK code points used by at least three 293 languages, and Hangul is only used by Korean. When the four characters 294 are combined into a DNS name, it takes the following form as its full name: 296 simnitomot0000sim0ni0t2o0m2o2t0 298 so kr-simnitomot0000sim0ni0t2o0m2o2t0.com can be the DNS name or it 299 may be the full name record to be kept at the local registrar and be 300 registered with DNS as "kr-simnitomot.com". StepCode permits different 301 language tags to access the same glyph in [ISO10646]. 303 1.5.2 Multi-script Accommodation 305 The StepCode protocol allows mixed scripts to co-exist. For example, 306 consider the five Kana, diacritic mark and Kanji from Japanese: 308 U+3055 sa 309 U+30fc 1 310 U+3073 bi 311 U+3059 su 312 U+???? gyo1go0 313 (Note: Only one radical in a Kanji is coded, since the set 314 of Kanji is much smaller than the Han character set. Thus, one radical 315 to be coded may be enough to guarantee a unique code within Kanji.) 317 Because decoding is more complex for Kanji than for Kana, a delimiter 318 between the two seems needed, so a digit 0 may be required to end the kana 319 section. Thus the DNS name "sa1bisu0gyo1go0" may be used. This shows 320 that the StepCode protocol can be adapted to many different mixes of scripts, 321 and that different languages need different treatments of their scripts. 323 1.5.3 Fully Compatible with Current DNS 325 As the Chinese, Korean and Japanese examples given above show, the host 326 parts contain no international glyphs, only US-ASCII; they can be valid 327 entries in DNS and allow standard compression or security treatment 328 compatible with existing hostnames. 330 1.5.4 One Mnemonic System 332 It is one mnemonic system for any script in UCS, such that whatever 333 the language that the zone master understands, he can refer to, sort 334 on, and support a registered IDN name. 
336 1.6 Author's Disclaimer 338 This document is a guide for implementing the mnemonic StepCode protocol 339 for IDN hostname identifiers in a language specific way. It is not 340 a natural language dictionary of any degree. The sound value 341 assignments of script symbols, although balanced among several 342 considerations, are not intended in any way to claim any linguistic 343 expertise. The different scripts used by any one particular user 344 group addressed in the document do not dictate the user groups' 345 choice of any subsets of [ISO10646] symbols. 347 In addition, the document is biased on five issues: 348 1) The UCS symbol tabulation structure assumed is biased toward CJK users; 349 2) The mnemonic sound value is based on IPA classification; 350 3) The Latin letter value assignment is biased toward English usage; 351 4) The digits value assignment is biased toward Mandarin usage; 352 5) The language tag function is biased toward Indian languages. 354 1.7 Terminology 356 The key words "MUST", "SHALL", "REQUIRED", "SHOULD", "RECOMMENDED", 357 and "MAY" in this document are to be interpreted as described in 358 [RFC2119]. 360 Examples in this document use the notation from the Unicode Standard 361 [Unicode3] as well as the ISO 10646 names. For example, the letter 362 "a" may be represented as either "U+0061" or "LATIN SMALL LETTER A". 364 A non-Roman character is also denoted in its Romanized form 365 followed by its English equivalent word in <>, for example, "zhong 366 <middle>", without reference to Unicode, due to difficulties in pinning down 367 all the code points used in this document from the UCS table. 369 An IPA symbol is presented in [] when it is referred to in the text. For 370 example, [c] is for the IPA sound value "c", not the Latin letter "c". 
372 StepCode assumes its encoding is language specific, each language as it 373 is defined in [ISO10646], has its mnemonic encoding and is a part of 374 ACE encoding prefixed to ASCII host name only, for example, �kr� for 375 Korean, �ja� for Japanese. The encoding is called �language tag� of a 376 DNS host name (for language tag implementation see [IDNmap] Section 3). 377 The DNS host name with such a language tag is called a "language tagged 378 ACE", or "T-ACE". 380 StepCode converts a list of internationalized characters at a client 381 site into a string of US-ASCII that are acceptable as a host name in 382 current DNS host naming usage. The former are called a list of �IDN 383 identifiers� or a "glyph" for a symbol represented by one code point 384 in [ISO10646] or "glyphs" for a string of glyphs and the post-converted 385 ASCII string is called a "DNS identifier". 387 [Nameprep] defines Unicode characters mappings, normalizing and 388 exclusions of internationalized host names. The characters from input 389 and in mapping and normalization list is called �IDN-label�, or IDN 390 input, which includes symbols mapping to null. IDN-label is a super set 391 of IDN identifiers in term of UCS code points. 393 The "IDN-label" at a client site may be represented by Unicode, GB code, 394 JIS code, BIG5 and others which may contain equivalent information. 395 These code forms are referred as language specific "localized code 396 points", or �local display codes�. 398 A large script such as CJK or UCS can be classified into three glyph 399 groups: 400 1) IDN letters: which can be directly mapped onto an alphanumeral 401 symbol under the Latin mnemonic hat, for example, Bopomofo, Kana, 402 Arabic, Bengali, Hebrew, Jamo, diacritics, etc. 
403 2) IDN radicals: a minimum number of frequently used glyphs which are 404 also used as radicals in other glyphs, and often has independent 405 pronunciation, for example, U+2f00 to U+2fd5, U+2e80 to U+2ef3, and 406 others scatted in CJK Plane 0 blocks; 407 3) IDN icons: the rest of the glyphs in the script, for example the 408 majority code points of CJK, enclosed alphanumerices, enclosed CJK 409 letters and ideographs. 411 The protocol uses US-ASCII to denote the phonetic elements of 412 a script and calls for standardizing such a mapping for each 413 language tag. The phonetic elements of a glyph is called "spelling" 414 of the glyph and is called "stem" for that of a radical. 416 StepCode procedure may have more than two complete iterations. 417 The first iteration is called �character transliteration� though 418 it may take in more linguistic defined elements in such a conversion 419 than a common term transliteration may imply. The second iteration 420 is called �radical transliteration�, for it transcribes radicals of 421 a glyph. The character to transliterated character table is called 422 �tagged section map� [IDNmap Sec. 2.2.3] or �tagged map�, while a 423 transliterated character is called a �StepCode�. The process of 424 converting an input string to T-ACE using a tagged map is called 425 �language tagged procedures� [IDNmap Sec. 4]. 427 According to phonetic nature of world scripts, three groups are 428 referred: Alphabet systems, including Latin, Cyrillic and Greek, 429 Consonant systems, ie. Indian, Arabic languages), and Character 430 Systems, ie. CJK languages. 432 1.8 IDN summary 434 The StepCode is a language dictated flexible ACE protocol and it is 435 complement to the currently proposed, UCS flat treatment ACE. Its 436 coding process reflects �Crowd Control� concepts to better organize 437 character and symbols before they are applicable in IDN system. 
To 438 deliver its full potential and to be more effective, it needs more 439 consensus building among groups regarding code point treatment 440 [Stone], which would be arguable points even if a flat UCS code point 441 treatment ACE is deployed alone in any case. 443 2. Host Name Transformation 445 According to [STD13], host parts must be case-insensitive, start 446 and end with a letter or digit, and contain only letters, digits, 447 and the hyphen character ("-"). This excludes any internationalized 448 characters, any font variations, case variations, character set 449 variations, as well as many other characters in the ASCII character 450 repertoire. Further, domain name parts must be 63 octets or shorter in 451 length including any language or other encoding tags. 453 User friendly encoding has to be coherent with users' native languages, 454 and consequently, host name transformation is dependent on the language 455 tag [IDNmap Sec. 3] selected. As a StepCode encoding guide, the 456 following discussion is focused on four different language groups: 457 Alphabet systems, Consonant systems, Character systems and mixed script 458 systems, from the simplest to more complex ones, and starts with a 459 general description of StepCode syntax. 461 2.1 StepCode Syntax 463 A StepCode unit is a string of [A-Za-z0-9] letters without any white 464 spaces, BLANK, in between. For each StepCode unit, there are data 465 elements indicated by "", which marks an element that MUST be supplied, and [] 466 where the element is optional, and / where the data is selectable. 468 Sx stands for primary sound value or spelling of xth glyph; 469 Tx stands for secondary sound value or tone of xth glyph; 470 Ry stands for Stem for yth radical; 471 Ly stands for Layout relation from radical y to y+1; 472 Rx.y stands for Stem for xth glyph and its yth radical; 473 Lx.y stands for Layout relation from xth glyph and its radical y to y+1.
475 2.1.1 One glyph 477 A code point or a glyph in UCS can be an IDN letter, an IDN radical or 478 an IDN icon. An IDN letter is a phonetic symbol in its native 479 language context marked by a language tag. For example Kana are IDN 480 letters in Japanese context. An IDN radical is an independent glyph often 481 used as a component of another glyph, or a glyph in a foreign language 482 context. For example a simple Han character or a Han radical (U+2e90 - 483 U+2ef3), a Greek letter in Chinese context. An IDN icon is a composite 484 glyph displayed in one display unit, normally a two dimensional square 485 area. The majority of CJK characters are IDN icons. IDN icons can be 486 viewed as compositions in terms of radicals, or IDN letters. 488 StepCode is a language context sensitive transliteration of UCS code points. 489 The following is the formal definition and examples of StepCode for a glyph. 490 The minimum code for a StepCode is one ASCII letter: 491 "S"[T][P1][L1][P2][L2]...[Py][0/BLANK] 493 Thus, the following are examples of IDN letters, radicals and icons: 494 IDN letters: 495 A a 496 U+00c2 a6 497 U+0a98 gha 498 U+0a84 u1 499 IDN radicals: 500 U+03b1 alf0 501 U+2f26 U+5b50 zi3z0 502 U+2f24 U+5927 da4d0 503 U+2f29 U+5c0f xiao3x0 504 U+2f25 U+5973 nv3n0 505 IDN icons: 506 U+2639 :-(0 507 U+263a :-)0 508 U+5b59 sun1zi1xiao0 509 U+597d hao3nv1zi0 510 U+5c16 jian1xiao2da0 512 Where the Unicode code points are IDN identifiers, the ASCII code column is 513 the corresponding transliterated StepCode, or DNS identifiers, and the 514 phonetic system used is Chinese Pinyin. 516 2.1.2 Glyphs 518 A string of glyphs is considered as one unit with only alphanumerals: 520 "S1S2S3...Sx"[T1T2...Tx][P1.1][L1.1][P1.2][L1.2]...[P1.y][0] 521 [P2.1][L2.1][P2.2][L2.2]...[P2.y][0] 522 ...
523 [Px.1][Lx.1][Px.2][Lx.2]...[Px.y][0/BLANK] 525 Example of glyphs: 526 Latin AaA^a aaa6a 527 Gujarati U+0a98 U+0a84 gha + u1 -> ghu1 528 Chinese U+597d U+5c0f U+5b50 haoxiaozi333nv1zi0x0z0 530 StepCodes are language specific. The above examples are from three 531 language groups with a common mix of symbols from the same languages. 532 The Latin example includes capital letter A Circumflex, which 533 is mapped to digit 6. 535 Gujarati letter GHA has an implicit vowel "a"; by the transliteration 536 rule, when another vowel follows the consonant the implicit vowel is 537 replaced. 539 The Chinese phrase in the above example shows a mix of IDN 540 radicals and icons encoding, where the first three digits indicate 541 three characters in the unit, and three radical transliterations 542 immediately follow. 544 2.2 Glyph Boundary Marks 546 Most script transliterations are mapped to an alphabet system consistent 547 with consonant-vowel-terminal structure. The majority of "glyph to 548 glyph sequence" and "glyph sequence back to glyph" can be done with 549 a minimum amount of linguistic rules embedded in glyph sequence 550 composing and decomposing procedures. 552 There are always exceptions to any rules in linguistics. For example, 553 the uses of "-" in Chinese and Korean, the uses of "ü" in French and 554 Chinese, the uses of letters "ZWNJ" in Arabic, and the use of "|" in 555 Tibetan and Devanagari to prevent two units from joining, are complements 556 to the consonant-vowel-terminal rule. 558 In the DNS system, only hyphen "-" is allowed for this purpose, and there 559 may be more than one level of disjoint that a host name of a script has to 560 differentiate. It is RECOMMENDED to consider an unused or non-conflicting 561 letter first before the "-" has to be used in the transliteration of a 562 language tagged script. For example, the "ü" in Chinese Pinyin may be 563 mapped to the letter "v" instead of a hyphen "-".
565 2.3 Encoding Steps 567 StepCode starts at a phonetic representation of a glyph with ASCII 568 letters and a digit when needed. This character transliteration 569 has two phases as in Sec. 2.1.1 IDN letter examples: 570 S1.1. Romanize the primary phonetic characteristic of a 571 glyph/phrase; 572 S1.2. Supplement the secondary phonetic characteristic of the 573 glyph with a digit/digits. 575 The second step of StepCode is applied to components of each glyph, 576 radical transliteration, in the same way specified in S1.1, and shown 577 in Sec. 2.1.1 IDN icon examples. 578 S2.1. Romanize the primary phonetic characteristic of a radical, B; 579 S2.2. Specify how the next radical is related to the current 580 radical, B, with a digit; 581 S2.3. If the radical contains another radical, X of B, 582 then go to S2.1 of X (and it is S2+1.1(X)); 583 otherwise go to the next radical, B+1. 585 2.4 Transliteration Schemes 587 Language is a creation of human thought, which wanders everywhere 588 regardless of boundaries. StepCode above is a rigid passageway, which only lets 589 properly formed traffic go through. While an alphabetic script 590 structurally appears closest to the Latin alphabet, a few general issues are 591 common to all transliterations. The first issue is which transliteration 592 should be implemented. The Unicode Consortium has given each symbol a Latin 593 name for ease of reference. Such a name contains the main sound value of 594 the symbol, but usually more than what is needed in a transliteration. 595 For example, Cyrillic letter BE has sound value "b" in Latin, and it is 596 transliterated in [Translit 97] as a "b". This introduces transliteration 597 modification #1 to Unicode, that the sound value of a glyph MAY be 598 extracted from its Latin name from the UCS standard.
600 2.4.1 Basic Phonetic Classifications 602 It is RECOMMENDED that when consulting publications on character 603 transliteration, the IPA [IPA] definition SHOULD be the primary classes 604 to be considered. IPA class is an artificial grid over an analog 605 spectrum. For each class there is a focus sound with a Latin letter 606 label, and its neighboring sound values slide into its neighboring 607 sound classes. It has the best classification of human language sound 608 values available and its focus sounds are labeled with Latin alphabet 609 letters. [Translit 97] has provided 54 romanization and transliteration 610 schemes, and SHOULD be one of the base transliteration documents. 612 2.4.2 Fuzzy Sound Value to Base Class Mapping 614 Once a sound value can be described with an IPA class, a 615 proximate letter representation can be referred to. Transliteration 616 Modification #2 is to consider a letter assignment in terms of IPA class. 617 It is RECOMMENDED that when an alphabet is used to represent a sound value 618 in a script, a balance between the current use of a letter in the same 619 script and common uses of the same letter in other languages shall be 620 found. The following is a comparison table of fricative alveolar-palatal 621 letter sound assignments of a group of sampled languages. The table is 622 expanded a little into Plosives, Post-Palatals and Approximants for 623 different sound value comparison with Arabic, Hindi, Vietnamese and 624 Chinese languages, and also is used as an illustration of the nature of IPA 625 classification.
627 Table header are IPA category represented as: 628 Alveolar Alveo 629 Postalveolar Postalv 630 Retroflex Retrof 631 Alveolar-Palatal Alv-Pal 632 Front Palatal FrontP 633 Palatal Pala 635 Plosive Plos 636 Affricative Affr 637 Fricative Fric 638 Approximant Approx 640 Languages tagged as: 641 Chinese zh- 642 Arabic ar- 643 Deutsch de- 644 English en- 645 Esperanto eo- 646 French fr- 647 Latin la- 648 Hebew he- 649 Japanese ja- 650 Korean ko- 651 Hindi hi- 652 Lao lo- 653 Russian ru- 654 Spanish es- 655 Serbo sr- 656 Tamil ta- 657 Urdu ur- 658 Vietnamese vi- 660 The IPA symbol entries: 661 U+0283 sh Latin Letter esh 662 U+0292 zh Latin Letter yogh 663 U+0282 s2 Latin Letter s hook 664 U+0290 z2 Latin Letter z Retroflex hook 665 U+0255 c3 Latin Letter c curl 666 U+0291 z3 Latin Letter z curl 667 U+029d j1 Latin Letter crossed-tail j 668 c U+0327 c1 Latin Letter c cedilla 670 Alveo Postalv Retrof Alv-Pal FrontP Pala 671 -------- -------- -------- -------- ------- ----- 672 Plos t d t2 d2 c 673 -------- -------- -------- -------- ------- ----- 674 ar-T ar-D 675 he-T 676 hi-t hi-d hi-T hi-D 677 hi-th hi-dh hi-Th hi-Dh hi-kh hi-gh 678 sr-c 679 vi-ch vi-c 680 ur-T 681 -------- -------- -------- -------- ------- ----- 682 Affr ts dz tsh dzh ts2 dz2 tc3 dz3 tc1 dj1 683 -------- -------- -------- -------- ------- ----- 684 zh-z en-ch en-j zh-zh zh-j 685 zh-c ar-ch zh-ch zh-q 686 de-ch 687 eo-cx eo-gx 688 he-ts he-ch 689 ja-ts ja-ch 690 ko-ch ko-tch/jj ko-gg 691 hi-c 692 hi-ch 693 lo-ch 694 es-ch 695 sr-dz sr-ch sr-dz2 696 sr-ts�� 697 ru-ts ru-ch ru-zh 698 ur-z ur-zh 699 -------- -------- -------- -------- ------- ----- 700 Fric s z sh zh s2 z2 c3 z3 c1 j1 701 -------- -------- -------- -------- ------- ----- 702 zh-s en-sh en-as zh-sh zh-r zh-x 703 ar-s ar-z ar-sh ar-zh ar-S ar-Z ar-H 704 de-s de-z de-sch 705 eo-s eo-z eo-sx eo-jx 706 fr-s fr-z fr-sh fr-je 707 he-s he-z he-sh 708 ja-s ja-z ja-sh ja-j 709 ko-s ko-ss ko-j 710 hi-s hi-sh hi-S 711 lo-s 712 es-s/c es-z 713 sr-s 
sr-z sr-sh sr-zh 714 ru-c ru-sh 715 vi-x vi-d vi-s 716 ur-s ur-z ur-sh 717 -------- -------- -------- -------- ------- ----- 718 Approx j 719 -------- -------- -------- -------- ------- ----- 720 hi-r hi-j 721 hi-l ta-l ta-L ta-l2 hi-jh 722 ta-r ta-N 723 ur-R 724 -------- -------- -------- -------- ------- ----- 725 Alveo Postalv Retrof Alv-Pal FrontP Pala 726 -------- -------- -------- -------- ------- ----- 727 Table 1. Romanized Latin letter assignments found in contemporary text 728 books, bilingual dictionaries and [Translit 97]. 730 More notes on table entries: 731 The entries under column headers are in unvoiced vs. voiced pairs. 732 The entries of the same column with a same language tag are non-aspirated 733 and aspirated pairs in two rows, for example: 734 hi-c 735 hi-ch 736 The uppercase letter assignments are taken from certain text books, 737 where the transliteration takes several forms: doubling letters (common 738 in text books), a dot under a letter(Library of Congress) and a 739 capital letter (IPA convention). 741 Particular languages often have several sounds falling into the same 742 class, or under the neighboring classes of IPA table, but very few under 743 other labels. This phenomenon is can be found in above, Table 1. It is 744 RECOMMENDED to follow conventional use of neighboring labels to 745 differentiate the value concentrated classes, provided it does not 746 conflict with other sound values which are already stable assignments. 747 Some language transliterations supplementing a secondary letter to the 748 label in focus often achieve satisfactory results, for example �ja-sh�. 
750 From the tabulated 18 language transliterations in Table 1, and 751 considering the conventional transliteration practice shown in the table, 752 the following sound value convention is RECOMMENDED: 754 Doubling vowel for a long vowel sound, (mostly used in Arabic) 755 Doubling consonant for sound produced from back position (Arabic, Hindi) 756 sh for U+0283, Latin Letter esh (All in the table) 757 j for U+0292, Latin Letter yogh (most in the table) 758 zh for dj/dz as an alternative for conventional dj and dz, it appears 759 quite popular in non-Roman languages. 760 ch t U+0283 (Almost all in the table have done so.) 761 c c/ts (Though existing TS is common, but a ��c�� is a clear favor 762 for simplicity, provided that [c] is covered under ��k��.) 763 h as an attachment letter for aspirated sound (as in Hindi). 764 n for nasalization, it is hard to separated from [n], as ��n-��, so a 765 diacritic is RECOMMENDED.) 766 k for [c],[k],[q] (It is rare to differentiate all the three in a 767 language. When it has such a need, a ��kk�� accomplishes the task 768 as it��s in Korean.) 770 Since most of the transliteration data of Table 1 is from English 771 literature, the recommendation above clearly is bias toward English 772 speakers. The bias is based on two reasons. The first is technical, that 773 common English does not use diacritical marks, so that it is a better 774 base scheme for adapting other language symbols which often use 775 diacritics. The second reason is the fact shown, in Table 2, that English 776 is the highest in number of population, as non-native language used in 777 the world currently. 779 The principle languages of the world �C 781 Source: S. 
Culbert, NI-25, University of Washington, Seattle, 782 WA 98195, USA; Data as of mid-1993 [WORLD 95] 783 Languages spoken by more than 100,000,000 people: 785 Native Non-native Total 786 Mandarin - 836 126 952 787 Hindi - 333 418 788 Spanish - 332 381 789 English - 322 148 470 790 Bengali - 189 196 791 Arabic - 186 219 792 Russian - 170 118 288 793 Portuguese - 170 182 794 Japanese - 125 126 795 German - 98 121 796 French - 72 124 797 Malay-Indonesian - 50 105 155 799 Table 2. Top four non-native languages used in the world: English, 800 Mandarin, Russian and Malay-Indonesian. 802 2.5 Alphabetic Script Transformation �C Mechanical Methods 804 Transliteration is mostly table lookups with minimum rules to implement. 805 Although alphabetic script transliteration is simplest, it is the place 806 to specify transliteration table format and a few basic concepts and 807 basic decision points in StepCode implementation, such as which phonetic 808 system shall be selected, which foreign symbol set to be included in a 809 language tagged script range [IDNmap] and how to include a foreign 810 symbol or a symbol set. 812 2.5.1 Transliteration Tables 814 Transliteration table usually contains two columns. To make referencing 815 easy for a layman, it is RECOMMENDED that transliteration tables contains 816 at least four columns: ASCII symbol, UCS glyph, IPA sound value, and 817 examples of spoken words of the language as shown in Table 3, with 818 necessary comments. 
820 ASCII UCS IPA Example 821 ru- 822 a U+0430 U+0251 : matb 823 b U+0431 b co6aka 824 v U+0432 v 825 g U+0433 g 826 d U+0434 d 827 e U+0435 e 828 j U+0436 U+02a4 829 z U+0437 z 830 i U+0438 i: 831 y U+0439 i 832 k U+043a k 833 l U+043b l 834 m U+043c m 835 n U+043d n 836 o U+043e U+0259 837 p U+043f p 839 r U+0440 r 840 c U+0441 s (��s�� in [Translit 97]) 841 t U+0442 t 842 w U+0443 u: 843 f U+0444 f 844 x U+0445 x (��kh�� in [Translit 97]) 845 ts U+0446 ts 846 ch U+0447 U+02a7 847 sh U+0448 U+0283 848 sch U+0449 U+0283 U+02a7 (��shch�� in [Translit 97]) 849 q U+044a (slilent) 850 h U+044b U+0263 851 q U+044c (soften the last consonant) 852 a U+044d U+00e6 853 iu U+044e ju: 854 ia U+044f j U+0251 : 856 Table 3. Russian Transliteration Table. 858 The third and forth columns are convenient references to phonetic data 859 threads online. 861 Structurally, alphabetic script is similar with Latin, where some letters 862 may represent different sound with Latin letter. For example, in Table 3 863 [Russian 44] the letter ��x�� and ��c�� are kept as Cyrillic letter, but in 864 [Translit 97] they are transliterated to ��kh�� and ��s�� respectively. Since 865 the letters used here do not present conflict assignment with other 866 letters, it is in the best interests of the native speakers to decide 867 which version shall be used as DNS identifiers. 869 2.5.2 Mixed used of Alphabetical scripts 871 The major alphabetical scripts are Latin, Greek and Cyrillic, with very 872 few cases using symbols from another script, for example �AGAPE� is Greek 873 in Latin script, not in Greek script. It is RECOMMENDED to have three 874 languages tags: la-, el- and ru- for Latin, Greek and Cyrillic, as three 875 respective primary language tags [IDNmap] for alphabetic scripts. 877 If an English user wants to include a symbol from Greek, he has to wait 878 for Latin tag to include Greek code block as its second script, if there 879 is enough demand for such a service. 
In this case, there are two methods 880 to include the transliteration table for Greek symbols in Latin tag. 882 The first one is to use a digit to indicate the second script set, as in 883 column 1 of Table 4, and is called �Overflow Symbol Mapping�(Section 3.3), 884 for simplicity in mechanical filling with a second set of symbols. 886 The second method is called �Radical mapping� is shown in column 2 of 887 Table 4. The name �radical� for Greek symbol is an analogy to radicals in 888 CJK, for a Greek letter has a sound and a name and can not be decomposed. 889 That is it is not a composite glyph, nor can it be sub-divided. They are 890 treated in the similar way with CJK character set in a foreign language. 892 A secondary script attached to Latin language tagged section map: 894 la- 895 a U+0061 896 b U+0062 897 ... 898 z U+007a 900 a9 alf0 U+03b1 901 b9 bet0 U+03b2 902 c9 gam0 U+03b3 903 d9 del0 U+03b4 904 e9 eps0 U+03b5 905 f9 zet0 U+03b6 906 g9 eta0 U+03b7 907 h9 the0 U+03b8 908 i9 iot0 U+03b9 909 j9 kap0 U+03ba 910 k9 lam0 U+03bb 911 l9 mu0 U+03bc 912 m9 nu0 U+03bd 913 n9 xi0 U+03be 914 o9 omi0 U+03bf 915 p9 pi0 U+03c0 916 q9 pho0 U+03c1 917 r9 fsi0 U+03c2 918 s9 sig0 U+03c3 919 t9 tau0 U+03c4 920 u9 ups0 U+03c5 921 v9 phi0 U+03c6 922 w9 chi0 U+03c7 923 x9 psi0 U+03c8 924 y9 ome0 U+03c9 926 Table 4. Two methods to expend the Latin script. 928 The pros for Column 1 is short and regular, provided the digit 9 is not 929 assigned to something else. The cons is hard to remember which letter 930 of Greek is in that Latin letter position. 932 The second method shown in Column 2 is easy to remember since a Greek 933 letter is mostly spelled out in a syllable ( and can be mapped according 934 to its sound value instead of the mechanical flooding as they are in 935 Table 4), but is harder for a program to tell the character boundary. 
936 A few options are available for amending the radical mapping 937 implementation: 938 1) Fill the short name up to make all the Greek symbols of uniform 939 length, say 3 letters. By recognizing digit 0, the decomposing procedure 940 can take the preceding 3 letters as one symbol; this is called the Protocol 941 method. 942 2) Insert another digit 0 before the Greek symbol to mark a foreign 943 symbol; this is called the Marker method. 944 3) Insert a hyphen "-" before the Greek symbol, to make an independent 945 sub-name unit; this is also a Marker method. 947 The pros of the above IDN radical symbol treatment are that it is flexible, in 948 terms of the number of symbols to be introduced, and in terms of naming 949 such a symbol so that a native reader understands it; also it can be used for 950 trademark encoding when there is such a request. The con is that it 951 lacks market data to support such an implementation. It is RECOMMENDED that 952 a radical mapping is selected for introducing foreign symbols into a 953 language tag. 955 Assuming the above recommendation is accepted, it is RECOMMENDED to use 956 Method 2) to mark a foreign symbol in a language tag, for it accommodates 957 variable length description of a foreign symbol, it is consistent with CJK 958 symbol treatment discussed in Section 2.7 and it preserves method 3) for 959 users to make individual decisions on their naming. 961 2.6 Consonant Script Transformation - Developmental Issues 963 The name for this group of scripts may not be accurate; it could just as 964 well be called the "rest of scripts" besides Euro and Han scripts. The 965 main concern in treating this group of scripts is treating each script 966 independently and not letting any rules made now develop into an extreme in the 967 near future. For example, one extreme is to forbid any new symbols to 968 enter a language tagged range, the other is to open up the whole UCS for one 969 language tag.
The Hindi language section map is selected here to examine 970 implementation issues, since it reflects some of the reality in that user 971 sector as well as in the engineering sector regarding language tag design 972 issues [Stone]. 974 hin- 976 7 U+0901 (nasalization) 977 U+0902 (no decision) 978 U+0903 (no decision) 980 a U+0905 U+028c 981 aa U+0906 U+0251 : 982 i U+0907 I 983 ii U+0908 i: 984 u U+0909 U+028a 985 uu U+090a u: 986 ri U+090b ri 987 lri U+090c lri 988 e U+090d e 989 e U+090e e 990 e U+090f e 991 ai U+0910 U+00e6/aI 992 o U+0911 U+0259 U+028a 993 o U+0912 U+0259 U+028a 994 o U+0913 U+0259 U+028a 995 au U+0914 U+0254 : / a U+028a 997 k U+0915 k 998 kh U+0916 x 999 g U+0917 g 1000 gh U+0918 g' 1001 ng U+0919 U+014b 1002 c U+091a U+02a7 1003 ch U+091b U+02a7 ' 1004 j U+091c j 1005 jh U+091d j' 1006 ny U+091e ni 1007 tt/T U+091f U+0288 1009 tth U+0920 U+0288' 1010 dd U+0921 U+0256 1011 ddh U+0922 U+0256' 1012 nd U+0923 nd 1013 t U+0924 t 1014 th U+0925 t' 1015 d U+0926 d 1016 dh U+0927 d' 1017 n U+0928 n 1018 nn U+0929 n (for Tamil n) 1019 p U+092a p 1020 ph U+092b p' 1021 b U+092c b 1022 bh U+092d b' 1023 m U+092e m 1024 y U+092f y 1026 r U+0930 r 1027 rr U+0931 r (for Tamil r) 1028 l U+0932 l 1029 ld U+0933 ld 1030 ll U+0934 l (for Tamil l) 1031 v U+0935 v 1032 sh U+0936 U+0283 1033 ss U+0937 U+0282 1034 s U+0938 s 1035 h U+0939 h 1037 q U+0958 q 1038 khh U+0959 q' 1039 ghh U+095a G' 1040 z U+095b z 1041 dddh U+095c U+0256 d' 1042 rh U+095d U+0280 1043 f U+095e f 1044 yy U+095f y: 1046 U+093a 1047 U+093b 1048 U+093c 1049 U+093d 1050 aa U+093e U+0251 : 1051 i U+093f I 1052 ii U+0940 i: 1053 u U+0941 U+028a 1054 uu U+0942 u: 1055 ri U+0943 rI 1056 rii U+0944 ri: 1057 e U+0945 e 1058 e U+0946 e 1059 e U+0947 e 1060 ai U+0948 U+00e6 / aI 1061 o U+0949 U+0259 U+028a 1062 o U+094a U+0259 U+028a 1063 o U+094b U+0259 U+028a 1064 au U+094c U+0254 : / a U+028a 1066 Table 5. IDN Hindi section Map [Hindi 98]. 
1068 Observations of Table 5: 1069 1) It has no example word column; 1070 2) It has not made decisions on several code points; 1071 3) It has adopted three Tamil symbols; 1072 4) the extra long vowel sound is indicated by doubling the vowel letter; 1073 5) the retroflex sound is indicated by doubling the consonant letter, 1074 while other forms exist, such as uppercase letter or an under letter 1075 mark as they are shown in Table 1 and [Translit 97]; 1076 6) the aspirated sound is indicated by letter ��h�� instead of an apostrophe 1077 ���� used in [IPA]; 1078 7) the symbol transliteration is not mechanical mapping, it needs 1079 linguistic rules to composing and decomposing a transliterated Latin 1080 string for Hindi. 1081 8) the nasalizing sign, Devangari Sign Candrabindu, is mapped to digit 7, 1082 since it is the last diacritical mark used in [Translit 97]. The 1083 under-letter marks either have been reflected in Table 5, or ignored 1084 due to implicit transliteration of Table 5; 1085 9) the section �U+093e - U+094c� are equivalent to section �U+0905 �C 1086 U+0914�, the section of symbols are not treated separately in 1087 [Translit 97]. These symbols could be included in canonicalizing 1088 procedure specified in [Nameprep] but dependent to input code 1089 processing. 1091 Each of the observations flags a developmental issue: 1092 1) Concerning the IDN as a long term solution or a short term fix. If this is 1093 a long term solution, then to fill up the column will benefit long term 1094 reference, there is no need to revisit the same issue when the reference 1095 is organized for later comers. 1096 2) The assignment of 10 digits has to consider its common meaning to 1097 other languages so that, there is conformity semantics for less confused 1098 implementation and long term use. 1099 3) Implies that Tamil language often appears among Hindi speakers. 
It is 1100 RECOMMENDED to consider inclusion of one to two other scripts for each 1101 of the languages in the Consonant language group in future IDN releases. 1102 4), 5) and 6) are differences with the [Translit 97] implementation. The advantages 1103 of this implementation are that it does not overload diacritical marks and is more 1104 reader friendly, with easier linguistic interpretation. The disadvantage is 1105 the use of a variable number of Latin letters for each Hindi symbol. 1106 7) As a result of 4), 5) and 6), more linguistic understanding is required 1107 in the implementation of language tagged procedures. 1108 8) With the more reader friendly treatment of Devanagari shown in 4)-7), 1109 there are enough digits to be used for other aspects of the linguistic 1110 issues, such as boundary, nasal, tonal or stress marks. 1111 9) Case mapping is a common issue, which can be applied equally to 1112 Latin, Chinese, Japanese, Hindi as well as wherever there are such 1113 requests, and which has been defined by their primary users. In any 1114 case, the Hindi case mapping requires a better understanding of how the 1115 symbols are used at the user end, both from the keyboard and in keyboard 1116 signal to text transformation and local code exchange standards. When 1117 such expertise is not available, there is still no basis for excluding 1118 such a case mapping from IDN. 1120 2.7 Character Script Transformation - Feasibility 1122 The commonly used symbol set for Chinese, Japanese and Korean is around 1123 4,000 characters each, with some differences in forms, while the majority of 1124 the symbols in each set overlap with the other two. Access to the 4,000 1125 characters is a headache if one has to select from a table of 4,000 1126 characters without some efficient indexing system. For the UCS CJK character 1127 set, the issue is to address over 21,003 characters using one primary 1128 language tag.
1130 For languages with a large number of glyphs, such as the CJK set, which are 1131 impossible to map onto a Latin alphabet directly, a three layered scheme 1132 is RECOMMENDED, and a minimum set of glyphs of the script which are often 1133 used as parts of other glyphs (for CJK, the radicals) SHOULD be derived. 1135 In the IDN system, the IDN letters include Bopomofo, Kana, and Jamo 1136 phonetic symbol sets. Since these systems all have been used, have stable 1137 transliteration standards to refer to, and have been discussed in 1138 previous sections, in this section the discussion will be focused on 1139 radical transliteration. 1141 2.7.1 Character transliteration Scheme for IDN Radicals 1143 Radicals are the building blocks of the CJK character set. Radicals are independent 1144 symbols with semantics and pronunciation or names. For example, 1146 Unicode Short form Long form 1147 U+03b1 alfa0 1148 U+5b50 zi0 zi3z0 1149 U+5927 da0 da4d0 1150 U+5c0f xiao0 xiao3x0 1151 U+5973 nv0 nv3n0 1153 are five radicals, where the first part of each code is the name of the 1154 radical, and the second part, as shown in the last column, is its 1155 primary sub-radical name letter. Mandarin has 417 sounds with an average of 4 1156 tones each, which in total covers the basic radical set of 1,500. With 25 letters 1157 before the delimiter 0, theoretically it is enough to give 23,000 UCS 1158 characters a unique index. However, it is not enough to give each character 1159 a unique mnemonic name to facilitate users' access. 1161 With the fast expansion of memory chips and transmission speed in the last 1162 10 years, vast amounts of data can be stored on any local chip for fast 1163 reference. It is doubtful that designing an index system that conforms to the above 1164 theory is wise. Instead, user friendly configuration should have the 1165 highest priority, and a complete set of data at ease of access shall be 1166 the base for a new IDN design philosophy.
1168 Considering the radical encoding above, although it is enough to have 1169 Pinyin with tone indicator as its transliteration, as zi3, xiao3, da4, 1170 and nv3, it creates a different coding format, such that when they are 1171 mixed with an IDN icon, two different formats require more rules in 1172 processing. For simplicity, IDN radicals take the same StepCode format 1173 as IDN icons, as shown in the last column of the above four examples, which 1174 all end with a digit 0 as delimiter, but include only one letter as 1175 their sub-radical encoding to indicate a simple character with no further 1176 decomposing. 1178 Thus, the longer form of IDN radical transliteration applies when 1) the 1179 radical set is large within a language tag, and the diacritical marks 1180 play a part in the transliteration; 2) the radicals are used with a large 1181 IDN icon set, such as CJK, where a uniform format with the larger set is 1182 preferred over code complexity, so the radical is treated as an IDN icon. 1184 The short form of IDN radical transliteration applies when 1) the radicals 1185 are a small set of foreign symbols under a concerned language tag, 2) a 1186 radical is used as radical transliteration of an IDN icon transliteration, 1187 as radical "xiao0" in Han character Sharp, "jian1xiao2da0". 1189 2.7.2 Radical Naming Convention 1191 Some glyphs in the IDN radical set are most frequently used glyphs by 1192 themselves, some are used by themselves only in a particular language, 1193 yet some of them never stand alone, and their names follow the naming 1194 convention which is listed below: 1196 "pang" - a radical on the left, "p" for short; 1197 "bian" - a radical on the right, "b" for short; 1198 "tou" - a radical on the top, "t" for short; 1199 "di" - a radical on the bottom, "d" for short; 1200 "xin" - a radical in the middle, "x" for short; 1201 "kuang"- a container or an enclosure radical, "k" for short. 
1203 Since CJK characters are written from left to right and top-down, 1204 often the "pang" is the first radical of a character to be used as the 1205 key for searching into dictionaries and is partially listed in UNICODE, 1206 so "pang" has the most number of them appear in an index table in a 1207 regular Han dictionary. 1209 2.7.3 CJK Character Coding Process 1211 CJK Character coding process reflects �Crowd Control� concepts: 1)survey 1212 Requests �C sorting, 2) select leaders �C identify equivalent cases, 3) 1213 mark directions �C mnemonic encoding, and 4) divert traffic �C leave out 1214 individual issues out for other applications. The principle applies to 1215 other UCS symbol transliteration encoding processes as well. 1217 The naming process SHOULD reflect a user��s viewpoint, not a programmer��s 1218 viewpoint. The following radical transliteration procedure is RECOMMENDED: 1219 1) Sort all the characters, include IDN icons, by Romanized names, which 1220 is Pinyin for Chinese, or a Latin symbol name in UCS; 1221 2) Delete all polyphones of a character but leave one as the IDN 1222 identifier; 1223 3) Sort all the homophones by frequency of usage counting both as a 1224 radical and as an IDN icon, and obtain a sorted list on frequency of 1225 usage, for example: 1226 fei-20 fei-8 fei-3 fei-2 fei-1 1227 4) Move the hard to decompose character to the front, and suppose fei1-3 1228 is such a character, then 1229 fei1-20 fei1-3 fei1-8 fei1-2 fei1-1 1230 5) Adjust homophone and polyphone characters as needed for easy coding 1231 discrimination; 1232 6) Code each of the above symbol in the order prepared above: 1233 fei1-20 fei1f0 (radical) 1234 fei1-3 fei1b0 (radical) 1235 fei1-8 fei1nv1yi0 1236 fei1-2 fei1caot2fei0 1237 fei1-1 fei1ko1fei0 1238 such that the front radical or character gets a shorter name; 1239 7) Identify semantically equivalent character set, and assign only one 1240 character per set to IDN identifier. 
1242 Additional care MUST be applied in above process for future application 1243 system developments: 1244 1) Reserve the polyphones opted out from Naming Process 2) and 5) above 1245 for other applications, for example user input processing, not 1246 discussed in IDN-map [IDNmap] but indicated in [SLS]. 1247 2) Reserve the members of semantically equivalent character set from 1248 Naming Process 7) above for other applications, for example IDN name 1249 display processing, which are not discussed in IDN-map [IDNmap], but 1250 indicated in [SLS]. 1251 3) For non-character radicals one may fall onto in Naming Process 6), 1252 a multi-syllabic name may be shorten with conventions specified in 1253 Section 2.7.2, for example, �cao zi tou� is shorten to �caot� in 1254 �fei1-2 fei1caot2fei0 � above. 1256 It is RECOMMENDED that the glyph transliteration process of CJK 1257 Characters DOES NOT bind by any particular radical list, which are only 1258 references as historical character decompositions. This introduces 1259 Transliteration modification #3 to UNICODE document, CJK radicals and 1260 radical supplement: U+2f00 to U+2fd5 and U+2e80 to 2ef3. 1262 Other limitations posted by IDN system application are discussed in 1263 [Stone] Section 3. Observing limitations and follows the above coding 1264 process and sort out equivalent character set phonetically and 1265 semantically is REQUIRED as the first step to tame �A Tangled Web� 1266 [RFC 2825]. 1268 2.7.4 Use of character transliteration 1270 It was a struggle to decide to put a full description of a Han character 1271 as its encoding or as its index, until the recent release of a wrist 1272 watch sized computer. It is clear that such a full description of a 1273 character will benefit symbolic processing greatly. For example, an 1274 automated voiced teaching tool may generate instructions on characters 1275 directly from the transliteration. 
IDN registration software can extract 1276 a DNS identifier from a full character description if such a holocode is 1277 available for access. For example, from the following IDN radicals and 1278 icons: 1279 U+5b59 sun1zi1xiao0 1280 U+597d hao3nv1zi0 1281 U+5c16 jian1xiao2da0 1282 U+5b50 zi3z0 1283 U+5937 da4d0 1284 U+5b0f xiao3x0 1285 U+5973 nv3n0 1287 It is easy to extract a transliterated word from the first part of the 1288 above listed StepCode, and get the word �haoxiaozi�. It is just as easy 1289 to match the second part, the radical transliteration only, to refer back 1290 to the character��s pronunciation. This is a hint for another type of user 1291 friendly input glyph processing. 1293 2.8 Mixed Script Transformation �CImplementing Japanese Tag 1295 Japanese using different phonetic system, its homophone list would be 1296 different with that of Chinese, but the coding procedure described in 1297 Section 2.7.3 SHOULD be the same. 1299 Section 2.7 concerning keeping one format for two types of characters, 1300 the radicals and icons of the same script. Japanese uses two different 1301 scripts from two script groups, kana and Kanji. Since Kana are IDN 1302 letters, and digits are diacritical marks of the letter preceded and 1303 appear at non-regular places, only digit 0 is reserved as delimiter. To 1304 include a Kanji among IDN letters, the rule of delimiter 0 SHOULD be 1305 applied as discussed in Section 2.5. For example, the Japanese section 1306 map: 1308 U+3055 sa 1309 U+30fc 1 1310 U+3073 bi 1311 U+3059 su 1312 U+???? gyo1go0 1314 Thus the DNS name �sa1bisu0gyo1go0� is readily available to be composed 1315 from these transliterated glyph codes. 1317 3. Numerical Symbol Value Assignments 1319 Though, it can be argued even among native speakers regarding a sound 1320 value of a symbol, the domain name identifiers only have 26 letters 1321 and some reasonable combinations within a script. 
These are the primary 1322 sound elements of a script in any case. Some changes to the primary 1323 sound elements are conventionally represented by modification marks 1324 to a primary symbol. Some modifications are significant and can be 1325 transcribed by a vowel from an alphabet system such as in Arabic. Others 1326 may be represented by a diacritics, as they are in French. UNICODE has 1327 provided clear separation along this line and some instructions on the 1328 functions of modification marks. 1330 Unicode also has listed more than 64 general diacritical marks, U+0300 to 1331 U+0340, while the use of them in a language is not more than 12 by 1332 [Translit 97], (Hindi 12, Ottoman Turkish 11, Azerbaijani and Telugu both 1333 have used 10). Among the usage, the under-letter diacritical marks can be 1334 reflected in letters by conventional transliteration methods used in 1335 dictionaries and text books as shown in Hindi transliteration Table of 1336 Sec. 2.6, so that none of them will need more than 9 diacritical marks. 1337 It is REQUIRED that digit 0 is reserved as icon delimiter from 1338 diacritical mark functions. 1340 Transliteration modification #4 to UNICODE document is to use a digit 1341 to represent diacritic like features, or secondary sound values of 1342 a script. 1344 A digit has no universal sound value associated to it like that of a 1345 Latin letter. It is a good word separator and a less confusing 1346 diacritical mark than that of a letter. For scripts have frequent use 1347 of diacritics, it is RECOMMENDED to use digit in place of a diacritic 1348 mark in a normalized string. For syllabic scripts, it is RECOMMENDED 1349 to use digits at the end of an IDN identifier to indicate a semantic unit 1350 and the number of IDN identifiers in a transliterated string as shown in 1351 Section 2. 1353 Although 26x10 is a two dimensional map, it can be filled with more than 1354 two phonetic aspects of a script. 
With increased complexity, the 1355 mnemonic value diminishes gradually. For simplicity, four phonetic 1356 mapping rules SHOULD be observed: R1. Diacritic mark mapping; R2. Phoneme 1357 Mapping; R3. Overflow consecutive slot mapping; R4. Priority elements 1358 mapping. 1360 3.1 Diacritic Mark Mapping 1362 [R1] Graphic based Diacritics mapping. For some scripts a 1363 secondary phonetic elements have to be marked for their users. 1364 For example European scripts, a simple diacritics mapping is 1365 RECOMMENDED, where the digits MAY denote common diacritics, tones 1366 and suprasegmentals. 1368 Tone mark Diacritics 1369 0 no tone voiceless (o) 1370 1 flat/high(-)/long macron (-) 1371 2 global rise (/) acute (/) 1372 3 dip and rising (v) breve (v) 1373 4 global fall (\) grave (\) 1374 5 thrill (~) tilde (~) 1375 6 rising-falling(^) circumflex (^) 1376 7 umlaut( " ) 1377 8 user assign cedilla (hook) 1378 9 user assign user assign 1380 Table 6. General Diacritics Mapping Table 1382 The assignment depends on four factors: 1) current user base with respect 1383 to keyboard assignment, 2) the number of marks in a script from a 1384 published dictionary, 3) IPA [IPA] value, 4) first come and first serve. 1386 The above assignments due to: 1387 1) #0 is reserved as icon delimiter; 1388 2) #1 �C 4 due to common naming as first, second, third and fourth tone in 1389 Chinese; 1390 3) #6 for common Qwerty keyboard assignment; 1391 4) #5 and 7 for frequent appearance in Russian, German, Spanish and 1392 Vietnamese; 1393 5) #8 a place holder for under-letter diacritic mark for Arabic and Hindi 1394 languages. 1395 6) #9 for possible inclusion of Overflow symbol set assignment shown in 1396 Section 2.5. 1398 The position of a similar marks are RECOMMENDED to stay in its 1399 respective position for ease interoperation cross script boundary and 1400 also for users looking for replacement marks. A French diacritical mark 1401 assignment is in Table 6. 
1403 French has fewer than eight but more than four diacritic marks; 1404 it is an example of phonetic mapping [R1]. 1406 fr- 1407 0 no tone 1408 1 Silent or Liaison ' 1409 2 rise/acute (/) 1410 3 (dip/breve is not used) 1411 4 drop/grave (\) 1412 5 thrill/tilde (~) 1413 6 throw/circumflex (^) 1414 7 dieresis (") 1415 8 Superscript or nasal n 1416 9 (not used for French) 1418 Table 7. French Example of Using Diacritics mapping. 1420 The French diacritical mark assignment is an example to demonstrate the 1421 usage of Table 6, not a French tag implementation. The fr- tag format is 1422 used for consistent presentation in this document. 1424 For scripts in a consonant system, a subset of marks is RECOMMENDED to be 1425 mapped to ASCII letters as its first choice, while the rest MAY be 1426 assigned a digit. Letters have associated sound values and are easier for a 1427 non-native speaker to attach its IPA label association. A digit is better 1428 used for separating a secondary property from its primary sound based on 1429 IPA definitions. An Arabic example assignment is provided in [Mnemonics]. 1431 3.2 Phoneme Table 1433 [R2] Sound based phoneme table mapping, where each digit specifies 1434 a variant of a base phoneme, and a maximum of nine variants may be 1435 accommodated. This rule has the best mnemonic result across different 1436 scripts. For example, IPA symbol mapping for English in Table 8. 1438 ipa- 1439 0 1 2 3 1441 a U+0251 ae U+00e6 U+0292 1442 b 1443 c ch U+02a7 1444 d 1445 e U+025b .e U+0259 .e: U+025c 1446 f 1447 g 1448 h 1449 i 1450 j U+02a4 1451 k 1452 l 1453 m 1454 n ng U+014b 1455 o U+0252 o: U+0254 1456 p 1457 q 1458 r 1459 s sh U+0283 1460 t th U+03b8 U+00f0 1461 u U+028c U+028a U+0075 1462 v 1463 w 1464 x 1465 y 1466 z zh U+0292 1468 Table 8. Example English Phoneme Mapping 1470 IPA symbol mapping for English has used four variants. The Unicode 1471 code point indicates the IPA symbols where an ASCII symbol can not be 1472 found. 
1474 A full set of IPA symbol Phoneme mapping is provided in [Mnemonics] for 1475 references. 1477 3.3 Overflowing 1479 [R3] Overflow Symbol mapping - where the symbols SHOULD fill 1480 in only consecutive slots in the opposite directions 1481 in the 26 x 10 table for ease of index computation, where the middle 1482 section of the table SHOULD be left for user selected 1483 definitions. This rule is suited for two sets of corresponding 1484 symbols of the similar scripts, for example Latin and Greek, Indian 1485 scripts. A Chinese version is shown in Table 9 for the method only, not 1486 in any way to suggest such an assignment. 1488 zh- 1489 0 no tone 1490 1 flat/macron (-) 1491 2 rise/acute (/) 1492 3 dip/breve (v) 1493 4 drop/grave (\) 1495 5 classic character drop/grave (\) 1496 6 classic character dip/breve (v) 1497 7 classic character rise/acute (/) 1498 8 classic character flat/macron (-) 1499 9 classic character no tone 1501 Table 9. Example use of Overflowing slot mapping. 1503 The above Overflow and Tone Mark mapping architecture, [R1-R3], 1504 partitions the 26 x 10 table to symmetric two different glyph sets. 1506 3.4 Priority List 1508 [R4] Priority elements mapping - Selecting a set of often used 1509 symbols to be placed in the table. 
For example: 1511 [R1-R3-R4] 1512 en- 1513 0 a-z 1514 1 flat/macron (-) 1515 2 rise/acute (/) 1516 3 dip/breve (v) 1517 4 drop/grave (\) 1518 5 thrill/tilde (~) 1519 6 throw/circumflex (^) 1520 7 dieresis (") 1521 8 Dingbats 1522 9 A-Z 1524 0 8 (Dingbats) 1525 a U+2604 /*areo or comet*/ 1526 b 1527 c U+24b8 /*copyright*/ 1528 d U+25ca /*diamond*/ 1529 e U+24d4 /*eletron*/ 1530 f U+2709 /*fly*/ 1531 g 1532 h U+2624 /*health or Caduceus*/ 1533 i U+261e /*index or white right pointing index*/ 1534 j 1535 k U+2654 /*king*/ 1536 l U+2661 /*love or white heart suit*/ 1537 m U+2709 /*mail or envelope*/ 1538 n U+266b /*note or Barred eighth note*/ 1539 o 1540 p U+262e /*peace symbol*/ 1541 q U+2655 /*queen*/ 1542 r U+2602 /*rain or umbrella */ 1543 s U+263a /*smile*/ 1544 t U+231a /*time or watch*/ 1545 u U+2328 /*utility or keyboard*/ 1546 v U+260e /*voice or phone*/ 1547 w U+270d /*writing*/ 1548 x 1549 y U+262f /* yinyang */ 1550 z 1552 Table 10. Example use of Priority Mapping. 1554 In fact, example Table 10 is general Latin script assignment, except the 1555 dingbats mnemonic values are keyed on English. DNS name resolver treats 1556 uppercase same as lower case, it provides no additional way for users 1557 to assign any specific value to upper case letters. One way to expand 1558 the symbol set allowed in DNS is to use [R3] as in Table 10. The English 1559 mapping assignment above takes rules [R1-R3-R4]. 1561 The above assignment rules MAY be used in a combination according 1562 to an order of weights in such an assignment. Such an order of weights 1563 SHOULD be specified in the form [Rx-Ry-Rz-R4] in front of a 1564 transliteration table of a language tag in form of comments. 1566 3.5 Digits as Radical Layout Indicators 1568 A unified CJK character is often a composition of several independent 1569 symbols from the script. It is possible to describe a CJK character by 1570 representing a character with only its radicals. 
Although it can identify 1571 a character uniquely, normally it is accompanied with a number of rules 1572 with too many exceptions for the majority of users to comprehend. 1573 StepCode encoding has reduced the complexity of the rules by considering 1574 a CJK character as a simple grid of 1 to 10 units. Naming the 1 to 10 1575 units in a linear fashion results a linear representation of the glyph or 1576 its encoding. 1578 The order of prioritizing radicals of a character is important. In 1579 general, the radical that one writes it with a pen containing the first 1580 stroke of a symbol in printing manner, which is publicized as part of a 1581 national education system is the �primary radical� of the symbol. For 1582 example the character �xin � (the digit is the tone of the character, 1583 hereafter) has two radicals: 1585 1) �qin1 � + �jin1 � 1587 Since �qin1� may be considered as two radicals as well, the radicals 1588 list may be in the following form too: 1590 2) �li4 � + �jin1� + �mu4 � 1592 or with different radical ordering: 1594 3) �li4 � + �mu4 � + �jin1 � 1596 In this case the �qin1� or �li4� both may be the primary radical 1597 dependent to which viewpoint of the user takes, which may be address 1598 in a different document. StepCode protocol favors 1) as discussed in 1599 Section 2.7. 1601 Variation in Radical transliteration can result in multiple 1602 StepCodes to one character within the same tagged map. It is due 1603 to 1) Radical transliteration is usually used as secondary 1604 representation of a character, however sometimes it may be used as 1605 its primary representation, when the correct sound of a character 1606 is not available to the user. 2) When viewing a character as a grid, 1607 there are disagreements on the number of units in a character. For 1608 domain names, the point of views in describing compositions of a 1609 character for a domain name MUST be limited to only one major 1610 viewpoint. 
The minor viewpoints SHOULD be converted to the major 1611 viewpoint, and radical transliteration MAY be the key to locate 1612 its character transliteration part through user interface when a 1613 name is registered. 1615 The digits in radical transliteration specifying how a radical of a glyph 1616 on its grid is related to the next radical, are called layout digits. 1617 Layout digits specify the relation to the next radical in line. The left 1618 and right direction are defined by a user's left or right hand while 1619 sitting in front of a display screen or a piece of paper. 1621 The glyph layout digits are: 1622 0 - end of a character or a radical 1623 1 - to its right 1624 2 - to its underside 1625 3 - to contain the following 1626 4 - to divide the following 1627 5 - to its left 1628 6 - to its top 1630 The following selectable digits are to specify additional 1631 glyphs of the script and directions of layout. 1633 7 - to overlay itself with X then to its right; 1634 8 - to overlay itself with X then to its left; 1635 9 - to overlay itself with X then to its underside. 1637 Table 11. Glyph Layout Numeral Values 1639 The radical layout scheme trades complexity of a glyph with code length, 1640 such that the complexity can be left out when an application only needs 1641 the character transliteration. 1643 4. Language Specific Procedures 1645 Either, StepCode may be obtained directly from local display codes to 1646 StepCode phrase conversion tables or to be taken from IDN identifier of 1647 language tagged section maps. Or, it inputs directly from keyboards, 1648 where an input processing module verifies correctness of intended glyphs 1649 and normalizes a StepCode. [Appendix] is an example of such cached input 1650 processing procedure. 1652 Different scripts have different transliterations published worldwide. 1653 These publications are the base for implementing tagged maps and tagged 1654 conversions as discussed in previous sections. 
1656 4.1 IDN Input Normalization Procedures 1658 The protocol contains two pairs of conversion and reversion procedures 1659 per language tag supported (see [IDNmap] Section 4.3) and calls for a 1660 minimum number of semantic independent symbols of a language to be mapped 1661 onto a Latin alphabet in a mnemonic manner (Section 2.7.3). The first 1662 pair of conversion and reversion procedures converts a language specific 1663 presentation form to a normalized form and vice versa, named as Normalize 1664 and Present procedures respectively, and have been described for 1665 Latin, Arabic and Chinese script implementation in [UAX 15][Bidi][Icdn]. 1667 4.2 DNS Fitting Procedures 1669 The second pair of language specific procedures converts a list of 1670 transliterated symbols to a name unit, whether it is a word or a phrase or 1671 an identifier of any kind, to fit into a desired format for any 1672 artificial goals, with the restriction that the format has to be reversible back 1673 into the list of transliterated symbols in its corresponding decomposing 1674 procedure. The pair of procedures is called Fitting and Decompose 1675 respectively. 1677 The purpose of assembling a StepCode is to be disassembled at its 1678 end of wire travel and indexing back into a tagged map, such that the 1679 pre-converted local display codes can be retrieved in an equivalent 1680 local display code worldwide. For some StepCode, when a list of 1681 character transliteration is combined into a string, it blurs the 1682 pre-converted symbol boundary, which is significant in their 1683 semantic differences, and interferes with correctly disassembling 1684 a StepCode string. It is RECOMMENDED that in such a case a hyphen, "-", 1685 is added as the last reserved character separator. 1687 When a post-converted string contains mixed scripts, for example 1688 Japanese domain names, and exceeds the maximum label length, only the 1689 characters with radical transliteration MAY be dropped. 
The truncated 1690 radical transliteration SHOULD reinsert a digit "0" to mark the end 1691 of radical transliteration, or use transmission protocols decided by 1692 network group among servers on how to deal with code length exceeding 1693 the DNS label maximum, or other protocols specific to a language 1694 tag to recover, partially recover or make intelligent guesses in preventing 1695 confusion when it is decomposed. 1697 Possible protocols for Fitting/Decomposing procedures depend on the scale 1698 of such format to be placed. 1699 1) Zone records: IDN zoned record keeping at IDN name registration locale; 1700 2) Caches: Cached traffic records at client sites; 1701 3) Exceptions: Exception handling rules implemented by protocols; 1702 4) Markers: Symbolic marker interpretation for specific language tag; 1703 5) Models: Embedded linguistic rule interpretation in Fitting/Decomposing 1704 programming languages. 1706 It is RECOMMENDED that each language tagged procedure specify 1707 which protocol type is implemented and what its effects are for world 1708 wide basic code maintenance. 1710 A StepCode string is assembled in an order consistent with keyboard 1711 input, regardless of how it would be displayed on a screen or in 1712 URI [URI]. For some scripts, the character display order may be 1713 rearranged. Such a display order is implied by the tagged display procedure, 1714 and is not a part of character transliteration nor a part of radical 1715 transliteration. Layout digits apply to layout directions within a 1716 character space as defined by UNICODE, NOT between characters. 1718 5. Embodiment of StepCode Protocol 1720 Symbolic representation in machine format with mnemonic label for human 1721 readers is a basic technique to improve human control over programs. With 1722 such a control of large name base, many artificial intelligence type of 1723 applications can benefit from it. 
For example, the mnemonic indexing 1724 system for UCS discussed in [IDNmap] may be extended to sort and index on 1725 trademarks and icons for automatic access needed in [WIPO]. 1727 A very much needed universal keyboard access to the full spectrum 1728 of code points in UCS becomes feasible. Imagine that a user picks a 1729 language tag from a pull-down window, and then types in the keys from a 1730 Latin alphabet labeled keyboard, gets the typed alphabet showing on the 1731 screen for the first level of input verification, and then looks at the 1732 transliteration to symbol conversion to get the second level, "spelling" 1733 verification. (A dream that the author has had for more than 15 years.) 1735 Since StepCode preserves the complete character information, it is a 1736 holocode scheme of a symbol. From it one may extract a set of radicals 1737 to infer the content of a discourse. For example, by recognizing a large 1738 presence of the "shui3" radical, one may infer a water body context. 1739 With such type of inference, a semantic net is not too far from reach. 1741 6. Security Considerations 1743 Much of the security of the Internet relies on the DNS. Thus, any 1744 change to the characteristics of the DNS can change the security of 1745 much of the Internet. Thus, StepCode makes no changes to the DNS 1746 itself. 1748 Hostnames are used by users to connect to Internet servers. The 1749 security of the Internet would be compromised if a user entering a 1750 single internationalized name could be connected to different 1751 servers based on different interpretations of the internationalized 1752 hostname. Thus the restriction of DNS names to a small symbol set is 1753 necessary and effective, where adding any other data format only 1754 opens the security gate to complications. 1756 7. Internationalization considerations 1758 StepCode is designed so that every internationalized hostname part can 1759 be represented as one and only one DNS-compatible string. 
If there 1760 are two different ways to obtain the same glyph on a display device, 1761 then they are still two distinct hostnames, with no bearing on DNS 1762 security issues. If there is any way to follow the steps in this 1763 document and get two or more different results, it is because of an 1764 error in the domain name registration process, where one domain name 1765 registrar fails to update other domain name registrar servers about a 1766 newly registered and well researched hostname. 1768 StepCode using only [a-z0-9] as the basic symbol set is a linguistically 1769 sound choice, since the base classification used by IPA, 1770 the only authoritative study on the subject, is the Latin symbol set. The symbol set 1771 has been successfully applied to the majority of languages on earth, and 1772 has been proven an effective set of symbols for people of many native 1773 tones to remember and to map to, as shown by the existing vast quantity of 1774 national standards and dictionaries. Thus [a-z0-9] is the best set of 1775 symbols to be used for universal mnemonic applications of any kind 1776 involving human records. StepCode is a symbol organization scheme to 1777 connect the symbol set to these applications. 1779 8. References 1781 [ASCII] American National Standards Institute (formerly United 1782 States of America Standards Institute), X3.4, 1968, "USA Code for 1783 Information Interchange". (ANSI X3.4-1968) 1785 [CJK] James SENG et al., "Han Ideograph (CJK) for Internationalized 1786 Domain Names", draft-ietf-idn-cjk-01.txt, 11th Apr 2001. 1788 [DeFrancis 1989] John DeFrancis, "Visible Speech - The Diverse 1789 Oneness of Writing Systems", 1989, ISBN 0-8248-1207-7. 1791 [Dictionary79] Beijing Foreign Language Dept., "A Chinese-English 1792 Dictionary", 1979, BK# 9017.810. 1794 [Icdn] Xiang Deng and Yan Fang Wang, "The Implementation of Chinese character 1795 in IDN", draft-ietf-idn-icdn-00.txt, July 2001. 
1797 [IDNReq] Zita Wenzel and James Seng, "Requirements of Internationalized 1798 Domain Names", draft-ietf-idn-requirements, May 2001. 1800 [IPA] The International Phonetic Alphabet, http://www2.arts.gla.ac.uk/IPA 1801 1996. 1803 [ISO639][ISO639-2/T] ISO/IEC 639-2 2001 Codes for the Representation of 1804 Names of Languages. 1806 [ISO10646] ISO/IEC 10646-1:2000 (note that an amendment 1 is in 1807 preparation), ISO/IEC 10646-2 (in preparation), plus 1808 corrigenda and amendments to these standards. 1810 [Hindi 98] "Hindi & Urdu Phrase Book", Lonely Planet Publications, 1998, 1811 ISBN 0-86442-425-6. 1813 [Translit 97] Barry, Randall K. 1997. ALA-LC romanization tables: 1814 transliteration schemes for non-Roman scripts. Washington: Library 1815 of Congress Cataloging Distribution Service. ISBN 0-8444-0940-5 1817 [PinyinCon] Library of Congress Pinyin Conversion Project, "New Chinese 1818 Romanization Guidelines", 1819 http://lcweb.loc.gov/catdir/pinyin/romcover.html#7 1821 [Macmillan93] The Macmillan Visual Desk Reference, 1993, 1822 ISBN 0-02-531310-x. 1824 [Mnemonics] Liana Ye, "Mnemonic Symbol Mapping of UCS". 1826 [RFC 2026] S. Bradner, "The Internet Standards Process -- Revision 3", 1827 1996, RFC 2026. 1829 [RFC2119] Scott Bradner, "Key words for use in RFCs to Indicate 1830 Requirement Levels", March 1997, RFC 2119. 1832 [RFC2277] "IETF Policy on Character Sets and Languages", 1833 rfc2277.txt, January 1998, H. Alvestrand. 1835 [RFC2396] Tim Berners-Lee, et al., "Uniform Resource Identifiers (URI): 1836 Generic Syntax", August 1998, RFC 2396. 1838 [Russian 44] "New Russian-English and English-Russian Dictionary", Dover 1839 Publications, New York, 1944, ISBN 0-486-20208-9. 1841 [SLS] M. Mealling & L. Daigle, "Service Lookup System (SLS)", 1842 draft-mealling-sls-00.txt 1844 [STD13] Paul Mockapetris, "Domain names - implementation and 1845 specification", November 1987, STD 13 (RFC 1035). 1847 [RFC2825] L. Daigle, Ed. 
"A Tangled Web: Issues of I18N, Domain Names, 1848 and the Other Internet protocols", May 2000, RFC 2825. 1850 [UAX15] Mark Davis and Martin Duerst. Unicode Standard Annex #15: 1851 "Unicode Normalization Forms", Version 3.1.0. 1852 1854 [UNICODE] The Unicode Consortium, "The Unicode Standard". Described at 1855 http://www.unicode.org/unicode/standard/versions/. 1857 [UNICODE30] The Unicode Consortium, "The Unicode Standard -- Version 1858 3.0", ISBN 0-201-61633-5. Same repertoire as ISO/IEC 1859 10646-1:2000. Described at http://www.unicode.org/unicode/ 1860 standard/versions/Unicode3.0.html. 1862 [URI] Roy Fielding et al., "Uniform Resource Identifiers: Generic 1863 Syntax", August 1998, RFC 2396. 1865 [Versions] Marc Blanchet, "Handling versions of internationalized domain 1866 names protocols", draft-ietf-idn-version-00.txt, October 26, 2000. 1868 [WIPO] "The Role of Technical Measures", RFC3, 1869 http://wipo2.wipo.int/process2/rfc/rfc3/index.html 1871 [WORLD 95] "The World Almanac and Book of Facts 1995", ISBN 0-88687-766-0 1873 [Ye95] Liana Ye, "A Language Oriented Chinese Encoding for Multilingual 1874 Computing Environments", in "Proceedings of the 1995 International 1875 Conference on Computer Processing of Oriental Languages", Page 323. 1877 9. Acknowledgements 1879 The author has benefited from special comments and suggestions from 1880 Aaron Irvine, John C Klensin, Eric Brunner-Williams, Erik Nordmark and 1881 William Davis and relevant discussions from IDN Working Group to improve 1882 this document. 1884 10. IANA Considerations 1886 This document requires IANA action for availability of script tag, 1887 and registration for each tag and possibly its sub-field for phonetic 1888 system used, and readiness of associated language specific procedures. 1890 11. Authors' Contact Information 1892 Liana Ye 1893 Y&D ISG 1894 2607 Read Ave. 1895 Belmont, CA 94002, USA. 
1896 (650) 592-7092 1897 liana.ydisg@juno.com 1899 Expires March 2002 1901 [Appendix] StepCode keyboard input process for Chinese 1903 /* buff.c StepCode processor interface Copyright Y&D ISG, Inc. 1994 1904 *-----------------------------------------------------------------------* 1905 * find_gly find a glyph online. 1906 * find_wd find a word online. 1907 */ 1909 #include 1910 #include 1911 #include "steplib.h" 1913 int auto_learn= TRUE; 1914 int udic_large= FALSE; 1915 int udic_database= FALSE; 1916 int odic_expand = FALSE; 1917 int dic_saved = FALSE; 1918 int keyboard_in = TRUE; 1919 int alt_memb = 2; /* extra members of a poly-code to be recorded */ 1921 /* 1922 * find_gly using a StepCode to find the GB code for display a glyph. 1923 */ 1924 int find_gly(step, stepcd, infor, gb, key) 1925 char *step, *stepcd, *infor, *gb; 1926 int *key; 1927 { 1928 FILE *bufp; 1929 int linecnt, bytes; 1930 char line[MAXdatalen], *p; 1931 char bufname[FILENAMSIZ]; 1933 strncpy(stepcd, step, strlen(step)+1); 1934 if (hit_gly(stepcd, gb)) 1935 { *key=GB; return(A_to_B);} 1937 strncpy(bufname, BUFFILE, FILENAMSIZ); 1938 bufp = (FILE *)fopen(bufname, "w+b"); 1939 if( bufp == NULL ) 1940 { 1941 strcpy( message, "Buffer file unavailable."); 1942 typo(message, word); 1943 return(ERROR); 1944 } 1945 search_dic(STEP, 1, stepcd, bufname, &bufp, &linecnt); 1946 if (linecnt<=0) 1947 { 1948 if(verbose) 1949 typo("No entry found in GB table. 
You may create one.", step); 1951 fclose(bufp); 1952 return(A_to_ZIL); 1953 } 1954 fseek( bufp, 0L, 0 ); /* to beginning sake read */ 1955 if(fgets(line, MAXdatalen, bufp)== NULL) 1956 { if(verbose) 1957 fprintf(stderr, "ERROR- buffer file read error.\n"); 1958 fclose(bufp); 1959 return(ERROR); 1960 } 1961 sscanf(line, "%s%d%s%s\n", stepcd, key, gb, infor); 1962 hash_gly(stepcd, gb); 1963 fclose(bufp); 1964 if (linecnt>1) 1965 { 1966 return( A_to_N); 1967 }else { 1968 return( A_to_B); 1969 } 1970 } 1972 int find_wd(step, stepcd, infor, gb, cnt, key) 1973 char *step, *stepcd, *infor, *gb; 1974 int cnt, *key; 1975 { 1976 FILE *bufp; 1977 int linecnt; 1978 char line[MAXdatalen], *p; 1979 char bufname[FILENAMSIZ]; 1981 strncpy(stepcd, step, strlen(step)+1); 1982 if ( hit_wd(stepcd, gb)) 1983 { *key = GB; return(A_to_B);} 1985 strncpy(bufname, BUFFILE, FILENAMSIZ); 1986 bufp = (FILE *)fopen(bufname, "w+b"); 1987 if( bufp == NULL ) 1988 { 1989 fprintf( stderr, "Buffer file unavailable."); 1990 return(ERROR); 1991 } 1992 search_dic(STEP, cnt, stepcd, bufname, &bufp, &linecnt); 1993 if (linecnt<=0) 1994 { if (!auto_learn) 1995 { 1996 if(verbose) 1997 typo("Not found. 
You may create the word.", step); 1998 fclose(bufp); 1999 return(A_to_ZIL); 2000 }else 2001 { 2002 neww = learnword(cnt, stepcd, gb); 2003 /* Do whatever with neww here */ 2004 if(dic_saved) 2005 { 2006 hash_wd(stepcd, gb); 2007 dic_saved = FALSE; 2008 } 2009 else 2010 { 2011 typo("The new word has not saved.", stepcd); 2012 } 2013 fclose(bufp); 2014 neww = reset_word(neww); 2015 return(ZIL_to_A); 2016 } 2017 } 2018 fseek( bufp, 0L, 0 ); /* to beginning sake read */ 2019 fgets(line, MAXdatalen, bufp); 2020 if(line == NULL) 2021 { 2022 if (ferror(bufp)!=0 && verbose) 2023 fprintf(stderr, "Error during buffer read.\n"); 2024 if (feof(bufp) !=0 && verbose) 2025 fprintf(stderr, "Buffer file ended.\n"); 2026 clearerr(bufp); 2027 fclose(bufp); 2028 return(A_to_ZIL); 2029 } 2030 sscanf(line, "%s%d%s%s\n", stepcd, key, gb, infor); 2031 hash_wd(stepcd, gb); 2032 fclose(bufp); 2033 if (linecnt>1) 2034 { 2035 return( A_to_N); 2036 }else { 2037 return (A_to_B); 2038 } 2039 } 2041 /* -------------------------------------------------------------------- 2042 * Figure out the number of glyphs in a word. The next two routines are 2043 * based on PINYIN system. 
2044 */ 2045 int one_letter_sound(word) 2046 char *word; 2047 { 2048 int cnt=0; 2049 char *w, *v; 2051 w=word; 2052 while (*w=='m'||*w=='M'||*w=='n'||*w=='N') 2053 { ++cnt; ++w;} 2054 if (cnt>0) 2055 { 2056 v = w; --v; 2057 if((*w=='g'||*w=='G')&& (*v=='n'||*v=='N')) 2058 ++w; /*ex: mng nnng*/ 2059 } 2060 if(cnt==0) while (*w=='a'||*w=='A'){ ++cnt; ++w;} 2061 if(cnt==0) while (*w=='o'||*w=='O'){ ++cnt; ++w;} 2062 if(cnt==0) while (*w=='e'||*w=='E'){ ++cnt; ++w;} 2063 if (!isalpha(*w)) 2064 return(cnt); /*ex:a aa ooo eee- mmm nmn*/ 2065 else cnt=0; /*ex: an hhh oong */ 2066 return(cnt); 2067 } 2069 int tell_word(word) 2070 char *word; 2071 { 2072 char *w, *v; 2073 int cnt; 2074 cnt=0; 2076 if(!isalpha(*word)) return (NULL); 2078 for (w=word;isalpha(*w);++w); /*skip Pinyin */ 2079 while (isdigit(*w)) {cnt++; ++w;} /*count the number of tone marks*/ 2081 if (cnt<1) /*special sigle letter glyph cases*/ 2082 { 2083 cnt = one_letter_sound(word); 2084 if (cnt>=1) return(cnt); /* else do syllable analysis */ 2085 } 2086 else return(cnt); 2088 /* 2089 * find the number of syllables by vowel rules 2090 * This implementation is accuate even without using apostrophe 2091 */ 2092 w=word; 2093 while (isalpha(*w)) /*check the Pinyin only*/ 2094 { 2095 switch (*w) 2096 { 2097 case 'a': 2098 case 'i': 2099 case 'e': 2100 case 'o': 2101 case 'u': v=w; ++w; cnt++; /*one vowel case*/ 2102 switch (*w) 2103 { 2104 case 'i': 2105 case 'e': 2106 case 'o': 2107 case 'u': ++w;break; /*two vowels sound*/ 2108 case 'a': ++w; 2109 if (*v=='u' && *w=='i') break;/*uai*/ 2110 if (*v=='i' && *w=='o') break;/*iao*/ 2112 else { 2113 --w; /*still two vowels*/ 2114 break; 2115 } 2116 default: break; 2117 } 2118 default: 2119 /*already get out off the compound vowel*/ 2120 break; 2121 } 2122 ++w; 2123 }/*check syllables*/ 2124 return(cnt); 2125 } 2127 /* 2128 * -------------------------------------------------------------------- 2129 * Interactive input process procedure 2130 * 
-------------------------------------------------------------------- 2131 */ 2132 inputp(char *word, char *gb) 2133 { 2134 int i, glyphcnt; 2135 char c, *w; 2136 int cnt, key, stat; 2137 char dump[MAXdatalen]; 2139 for (;;) 2140 { 2141 *word='\0'; 2142 fgets(word, MAXlinelen, stdin); 2143 if (isspace(*word)) 2144 break; 2146 /* Check if the entry is a glyph string by */ 2147 glyphcnt = tell_word(word); 2148 if (glyphcnt == NULL) 2149 { 2150 printf("%s", *word); 2151 fflush(stdin); 2152 continue; 2153 } 2155 w=word; 2156 while (isalnum(*w)) ++w; 2157 *w = '\0'; 2158 if(verbose) 2159 printf("tell_word figure: %d glyphs\n", glyphcnt); 2161 /* Determin the entry is known through dictionary 2162 * and cache lookup. 2163 */ 2164 if(glyphcnt >=2) 2165 stat = find_wd(word, stepcd, dump,gb,glyphcnt, &key); 2166 else stat = find_gly(word, stepcd, dump,gb, &key); 2168 /* Print out with GB code */ 2169 if (!stat==ERROR) font_code(stepcd, gb, &key, stderr); 2170 if(verbose) printf("%s\n", stepcd); 2171 fflush(stdin); 2172 fflush(stderr); 2173 } 2174 return(0); 2175 }