idnits 2.17.1 

draft-ietf-idn-step-01.txt:
-(264): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(286): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(289): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(300): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(319): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(375): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(422): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(453): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(482): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(535): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(548): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(553): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(554): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(558): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(561): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(562): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(595): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(596): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(748): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(761): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(762): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(764): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(767): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(863): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(864): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(872): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(883): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(886): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(887): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(944): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(964): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1076): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1085): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1211): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1212): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1213): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1217): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1253): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1288): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1290): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1314): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1388): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1581): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1582): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1690): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1732): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1738): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1817): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1826): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1865): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding
-(1871): Line appears to be too long, but this could be caused by non-ascii characters in UTF-8 encoding

  Checking boilerplate required by RFC 5378 and the IETF Trust (see
  https://trustee.ietf.org/license-info):
  ----------------------------------------------------------------------------

  ** Looks like you're using RFC 2026 boilerplate.  This must be updated to
     follow RFC 3978/3979, as updated by RFC 4748.


  Checking nits according to https://www.ietf.org/id-info/1id-guidelines.txt:
  ----------------------------------------------------------------------------

  ** The document seems to lack a 1id_guidelines paragraph about 6 months
     document validity -- however, there's a paragraph with a matching
     beginning. Boilerplate error?

  ** The document is more than 15 pages and seems to lack a Table of Contents.

  == There are 113 instances of lines with non-ascii characters in the
     document.

  == No 'Intended status' indicated for this document; assuming Proposed
     Standard

  == The page length should not exceed 58 lines per page, but there was 1
     longer page, the longest (page 1) being 2188 lines


  Checking nits according to https://www.ietf.org/id-info/checklist :
  ----------------------------------------------------------------------------

  ** The document seems to lack separate sections for Informative/Normative
     References.  All references will be assumed normative when checking for
     downward references.

  ** There are 68 instances of too long lines in the document, the longest
     one being 6 characters in excess of 72.

  ** There are 456 instances of lines with control characters in the document.

  ** The abstract seems to contain references ([ISO639]), which it shouldn't.
      Please replace those with straight textual mentions of the documents in
     question.


  Miscellaneous warnings:
  ----------------------------------------------------------------------------

  == Line 641 has weird spacing: '...Chinese    zh-...'

  == Line 643 has weird spacing: '...Deutsch    de-...'

  == Line 644 has weird spacing: '...English    en-...'

  == Line 645 has weird spacing: '...peranto  eo-...'

  == Line 649 has weird spacing: '...apanese   ja-...'

  == (14 more instances...)

  == The document seems to lack the recommended RFC 2119 boilerplate, even if
     it appears to use RFC 2119 keywords. 

     (The document does seem to have the reference to RFC 2119 which the
     ID-Checklist requires).
  -- The document seems to lack a disclaimer for pre-RFC5378 work, but may
     have content which was first submitted before 10 November 2008.  If you
     have contacted all the original authors and they are all willing to grant
     the BCP78 rights to the IETF Trust, then this is fine, and you can ignore
     this comment.  If not, you may need to add the pre-RFC5378 disclaimer. 
     (See the Legal Provisions document at
     https://trustee.ietf.org/license-info for more information.)

  -- Couldn't find a document date in the document -- date freshness check
     skipped.

  -- Found something which looks like a code comment -- if you have code
     sections in the document, please surround them with '<CODE BEGINS>' and
     '<CODE ENDS>' lines.


  Checking references for intended status: Proposed Standard
  ----------------------------------------------------------------------------

     (See RFCs 3967 and 4897 for information about using normative references
     to lower-maturity documents in RFCs)

  == Missing Reference: 'ISO 639' is mentioned on line 36, but not defined

  == Missing Reference: 'UCS' is mentioned on line 247, but not defined

  == Missing Reference: 'Uname' is mentioned on line 268, but not defined

  == Missing Reference: 'IDNmap' is mentioned on line 1724, but not defined

  == Missing Reference: 'Unicode3' is mentioned on line 361, but not defined

  == Missing Reference: 'Nameprep' is mentioned on line 1088, but not defined

  == Missing Reference: 'Stone' is mentioned on line 1263, but not defined

  == Missing Reference: 'A-Za-z0-9' is mentioned on line 463, but not defined

  == Missing Reference: 'T' is mentioned on line 491, but not defined

  == Missing Reference: 'P1' is mentioned on line 491, but not defined

  == Missing Reference: 'L1' is mentioned on line 491, but not defined

  == Missing Reference: 'P2' is mentioned on line 491, but not defined

  == Missing Reference: 'L2' is mentioned on line 491, but not defined

  == Missing Reference: 'Py' is mentioned on line 491, but not defined

  -- Looks like a reference, but probably isn't: '0' on line 521

  == Missing Reference: 'SLS' is mentioned on line 1250, but not defined

  == Missing Reference: 'RFC 2825' is mentioned on line 1266, but not defined

  == Missing Reference: 'R1' is mentioned on line 1404, but not defined

  == Missing Reference: 'R2' is mentioned on line 1433, but not defined

  == Missing Reference: 'R3' is mentioned on line 1558, but not defined

  == Missing Reference: 'R1-R3' is mentioned on line 1503, but not defined

  == Missing Reference: 'R4' is mentioned on line 1508, but not defined

  == Missing Reference: 'R1-R3-R4' is mentioned on line 1559, but not defined

  == Missing Reference: 'Rx-Ry-Rz-R4' is mentioned on line 1563, but not
     defined

  == Missing Reference: 'Appendix' is mentioned on line 1901, but not defined

  == Missing Reference: 'UAX 15' is mentioned on line 1665, but not defined

  == Missing Reference: 'Bidi' is mentioned on line 1665, but not defined

  == Missing Reference: 'MAXdatalen' is mentioned on line 2137, but not
     defined

  == Missing Reference: 'FILENAMSIZ' is mentioned on line 1979, but not
     defined

  == Unused Reference: 'ASCII' is defined on line 1781, but no explicit
     reference was found in the text

  == Unused Reference: 'Dictionary79' is defined on line 1791, but no
     explicit reference was found in the text

  == Unused Reference: 'IDNReq' is defined on line 1797, but no explicit
     reference was found in the text

  == Unused Reference: 'ISO639' is defined on line 1803, but no explicit
     reference was found in the text

  == Unused Reference: 'PinyinCon' is defined on line 1817, but no explicit
     reference was found in the text

  == Unused Reference: 'Macmillan93' is defined on line 1821, but no explicit
     reference was found in the text

  == Unused Reference: 'RFC 2026' is defined on line 1826, but no explicit
     reference was found in the text

  == Unused Reference: 'RFC2277' is defined on line 1832, but no explicit
     reference was found in the text

  == Unused Reference: 'RFC2396' is defined on line 1835, but no explicit
     reference was found in the text

  == Unused Reference: 'SIS' is defined on line 1841, but no explicit
     reference was found in the text

  == Unused Reference: 'RFC2825' is defined on line 1847, but no explicit
     reference was found in the text

  == Unused Reference: 'UAX15' is defined on line 1850, but no explicit
     reference was found in the text

  == Unused Reference: 'UNICODE' is defined on line 1854, but no explicit
     reference was found in the text

  == Unused Reference: 'UNICODE30' is defined on line 1857, but no explicit
     reference was found in the text

  == Unused Reference: 'Versions' is defined on line 1865, but no explicit
     reference was found in the text

  -- Possible downref: Non-RFC (?) normative reference: ref. 'ASCII'

  -- Possible downref: Normative reference to a draft: ref. 'CJK' 

  -- Possible downref: Non-RFC (?) normative reference: ref. 'DeFrancis 1989'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'Dictionary79'

  -- No information found for draft-ietf-idn-icdn - is the name correct?

  -- Possible downref: Normative reference to a draft: ref. 'Icdn' 

  -- Possible downref: Normative reference to a draft: ref. 'IDNReq' 

  -- Possible downref: Non-RFC (?) normative reference: ref. 'IPA'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'ISO639'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'ISO10646'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'Hindi 98'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'Translit 97'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'PinyinCon'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'Macmillan93'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'Mnemonics'

  ** Obsolete normative reference: RFC 2396 (Obsoleted by RFC 3986)

  -- Possible downref: Non-RFC (?) normative reference: ref. 'Russian 44'

  == Outdated reference: A later version (-02) exists of draft-mealling-sls-00

  -- Possible downref: Normative reference to a draft: ref. 'SIS' 

  ** Downref: Normative reference to an Informational RFC: RFC 2825

  -- Possible downref: Non-RFC (?) normative reference: ref. 'UAX15'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'UNICODE'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'UNICODE30'

  -- Duplicate reference: RFC2396, mentioned in 'URI', was also mentioned in
     'RFC2396'.

  ** Obsolete normative reference: RFC 2396 (ref. 'URI') (Obsoleted by RFC
     3986)

  -- Possible downref: Normative reference to a draft: ref. 'Versions' 

  -- Possible downref: Non-RFC (?) normative reference: ref. 'WIPO'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'WORLD 95'

  -- Possible downref: Non-RFC (?) normative reference: ref. 'Ye95'


     Summary: 10 errors (**), 0 flaws (~~), 54 warnings (==), 29 comments
     (--).

     Run idnits with the --verbose option for more detailed information about
     the items above.

--------------------------------------------------------------------------------

1	Internet Draft                                              Liana Ye
2	draft-ietf-idn-step-01.txt                                   Y&D ISG
3	Sept. 28, 2001
4	Obsoletes: draft-ietf-idn-step-01.txt
5	Expires in six months (March 2002)

7		 StepCode - A Mnemonic Internationalized Domain Name Encoding

9	Status of this memo

11	This document is an Internet-Draft and is in full conformance with
12	all provisions of Section 10 of RFC2026.

14	Internet-Drafts are working documents of the Internet Engineering
15	Task Force (IETF), its areas, and its working groups. Note that
16	other groups may also distribute working documents as
17	Internet-Drafts.

19	Internet-Drafts are draft documents valid for a maximum of six
20	months and may be updated, replaced, or obsoleted by other documents
21	at any time. It is inappropriate to use Internet-Drafts as reference
22	material or to cite them other than as "work in progress."

24	     The list of current Internet-Drafts can be accessed at
25	     http://www.ietf.org/ietf/1id-abstracts.txt

27	     The list of Internet-Draft Shadow Directories can be accessed
28		 at http://www.ietf.org/shadow.html.

30	Abstract

32	This document describes an Internationalized Domain Name (IDN)
33	Encoding method with US-ASCII [a-z0-9] characters, preserving the
34	primary sound value of such names users want, and technically
35	feasible, linguistically demanding once mechanism to represent the
36	names of multi-scripts with language tags defined by [ISO 639] in
37	the required DNS way, such that the encoded names can be used as
38	valid domain name identifiers.

40	  Table of Contents
41	1. Introduction
42	  1.1 Context
43	  1.2 Issues
44	  1.3 Romanized Multi-language Representation
45	  1.4 StepCode Protocol to Represent Trade Names
46	  1.5 StepCode Features
47	  1.6 Disclaimer
48	  1.7 Terminology
49	  1.8 IDN summary
50	2. Host Name Transformation
51	  2.1 Syntax of StepCode
52	  2.2 Glyph Boundary Marks
53	  2.3 Encoding Steps
54	  2.4 Transliteration Schemes
55	  2.5 Alphabetic Script Transformation - Mechanical Methods
56	  2.6 Consonant Script Transformation - Developmental Issues
57	  2.7 Character Script Transformation - Feasibility
58	  2.8 Mixed Script Transformation - Implementing Japanese
59	3. Numerical Symbol Value Assignment
60	  3.1 Diacritic Marks
61	  3.2 Phoneme Table
62	  3.3 Overflowing
63	  3.4 Priority List
64	  3.5 Radical Layout Indicators
65	4. Language Specific Procedures
66	  4.1 IDN Input Normalization Procedures
67	  4.2 DNS Fitting Procedures
68	5. Embodiment of StepCode Protocol
69	...

71	Tables:
72	Table 1. Romanized Latin Letter Assignments
73	Table 2. Top four non-native languages used in the world
74	Table 3. Russian Transliteration Table
75	Table 4. Two methods to expand the Latin script
76	Table 5. IDN Hindi Section Map
77	Table 6. General Diacritics Mapping Table
78	Table 7. Example of Using Diacritics mapping (French)
79	Table 8. Example Phoneme Mapping (Subset of IPA)
80	Table 9. Example use of Overflowing mapping (Chinese)
81	Table 10. Example use of Priority Mapping (English)
82	Table 11. Glyph Layout Numeral Values

84	1. Introduction

86	Symbolic representation of a concept takes on many forms. It can be
87	encrypted to conceal from a human reader, it can be compressed for a
88	mechanical program reader, and it can be an icon for any spoken
89	language readers. Because a domain name represents an entity such as
90	an individual, a product, or an organization, it has to be readable
91	by human readers in their native language, by human readers who do
92	not know that language, and by a computer program reader which only
93	reads code points. To bridge the three types
94	of requirement, StepCode is proposed to transform a native symbol to
95	one or more universal ASCII symbols in a mechanical manner for a
96	mechanical program reader.

98	1.1 Context

100	Although world-wide desire to use characters other than plain ASCII
101	in hostnames is bubbling up and accelerating, ICANN has to take
102	a cautious approach on adopting an international domain name system,
103	for the fear of duplicated or confused new domain names. The challenge
104	of how to represent the names users want in the DNS in a way that is
105	clear, technically feasible, and unique is still an open issue.

107	1.2. Issues in Multilingual Representation of DNS Host Names

109	A basic technical issue regarding a name is sorting and searching
110	zone files or name servers of hostname identifiers containing
111	different written languages for potentially very large numbers of
112	users online, say 10% of the world's population. Hostname
113	identification could become a bottleneck for internet traffic if
114	sorting and searching has to be treated 1) in more than one set of
115	partially overlapping or mixed or possibly mixed symbolic
116	representations; and 2) mostly in compressed or semantically random
117	ordered zone files scattered around the globe, as in the Shared
118	Registration System ("SRS") since its 1999 installation.

120	Historically, character-formed scripts such as CJK characters have
121	inherent sorting and indexing difficulties, and it used to be an
122	intellectual activity just to use a dictionary. In fact, it has been
123	a primary problem in computer processing of Oriental languages since
124	the early development of the computer industry.  After almost
125	importunate research and development in the past decades, the
126	solutions are all based on some type of table search, the nature of
127	such processing is well understood, and the techniques are ready to
128	be applied to very large character sets, such as the Universal
129	Character Set [UCS].

131	With the experiences we have obtained from Oriental languages
132	processing, and suppose that we have solved such an indexing problem
133	and have accommodated mixed scripts such as Japanese and Korean, and
134	IDN goes to a character-form based system, then it is foreseeable
135	that IDN system will have to support a text based DNS system as
136	well for a long time. After all, the DNS system is a historically
137	successful system. To throw such a system away is like asking
138	people to stop shopping at supermarkets and pick up their lettuce
139	on the Internet.  Then it is certain, that we have to deal with
140	two sets of domain name identifiers for a long time ahead.

142	The Romanized Pinyin, Jamo and On-kun systems for CJK character
143	indexing have provided a feasible but partial solution. The currently
144	used complete solution is to go through a software process of both
145	searching tables for possible matches (not exact-match DNS lookups)
146	and, where necessary, dialogue with the users, and arrive at strong
147	candidates for the glyph representation. If this character selection
148	process is organized in a way similar to the book indexing system,
149	alphanumeral-digits-digits..., used in a North American library, then
150	the indices can be codified using the Latin alphabet.  The dream of a
151	complete Romanized character system will be reality, sorting and
152	searching international domain names with one set of symbolic
153	representation will be speedy, and exactly matched DNS lookups could
154	be a reality.

156	1.3. Romanized Multi-language Representation

158	Codifying a trade name representation process is not limited to
159	codify a particular ASCII Compatible Encoding method or a particular
160	code mapping from one code standard to another code standard in a
161	technical context.  It shall codify one set of symbols, or one
162	representation system, and a number of efficient paths to let the users
163	have some freedom to decide how to use the system to express their own
164	trade names in the Internet context.  Though this was the spirit
165	of ASCII standard, it is the time to set more specific paths on how
166	to use ASCII to represent different scripts of spoken languages, or to
167	codify such a representation process, so that the number of paths does
168	not head for combinatorial explosion, as it is the case in Chinese
169	character encoding methods and for Japanese input systems. This is
170	analogous to let students tread out an optimal path on campus before a
171	concrete walk is poured, and it is our time to codify the paths.

173	Representation system for trade names is due to be unified. In fact,
174	writing system unification has been seen with Arabic, Latin and
175	Chinese.  Many different spoken language groups use each of them.
176	According to [DeFrancis 1989], human scripts can be organized into
177	three groups for their phonetic characteristics:
178	1. Syllabic systems, for example, Chinese, Japanese, Maya and Yi;
179	2. Consonantal systems, such as Hebrew, Arabic and Indian languages;
180	and 3. Alphabetic systems, including Greek, Latin, Cyrillic,
181	Korean Hangul and English.  Alphabetic systems can be unified by
182	embedding some differences under the hat of mnemonic representation
183	of language symbols, so that the French 'u' is permitted to have a
184	different sound value from the English 'u'.

186	Mapping a consonantal system to an alphabet symbol set is essentially
187	embedding some phonetic differences, using a Latin mnemonic hat.
188	Additionally, there is the question on how to represent the vowels
189	of the language. Turkey has provided an answer to this question, and
190	Library of Congress has implemented extensive set of languages using
191	the same principle [Translit 97].

193	As to unifying a syllabic system with an alphabet system, two issues
194	need to be addressed.  The first is the inclusion of additional
195	character information which can not be expressed with a one-layer
196	type of a flat alphabet system.  The second issue is the reversibility
197	from the alphabetic system back to the syllabic system.

199	1.4. StepCode Protocol to Represent Trade Names

201	The proposed solution is called StepCode, for its staircase type
202	architecture in a transliteration procedure. First, it specifies the
203	phonetic differences to be embedded in the representation, where an
204	International Phonetic Alphabet [IPA] description of the embedded
205	differences shall be recorded.  Second, if the Romanized embedding
206	is not sufficient to cover the differences, such as tones,
207	suprasegmentals and diacritics, then extend the mapping space to a
208	26x10 table for secondary phonetic elements which can not be embedded
209	under the Latin mnemonic hat. Third, if the 26x10 space is not
210	sufficient, then linearize the symbol by specifying each of its
211	components. This last part may become recursive, or go down for
212	more steps.

214	This open-ended procedure not only provides a path to unify a large
215	syllabic or character system with an alphabet symbol set, but also
216	ensures that more semantically specific symbols, such as trademarks
217	and logos, can be represented online and sorted for speedy referencing.
218	In addition, the solution tolerates different viewpoints of the same
219	glyph, such that a CJK character may be accessed by Mandarin Pinyin,
220	Cantonese Wade, or Japanese On-kun, Korean Hangul as well as users of
221	the same dialect creating different expressions in viewing the same
222	glyph.

224	StepCode protocol does not open doors for trade name chaos. First,
225	there are finitely many different scripts to support particular
226	dialects and expressions. Second, the protocol provides locally
227	available expressions for users to choose from, which also helps in
228	conforming expressions especially in IDN context. Third, although
229	the process allows users of the same dialect creating different
230	expressions in viewing a glyph, as it has been experienced with
231	over 600 variety of Chinese character encoding schemes in the past
232	three decades, it limits the different views of a glyph to a matrix
233	of one to ten cells on one fixed starting point [Ye95], where
234	variations in such a process become predictable and manageable.

236	Due to its step nature, the representation can (and should) stop
237	for each symbol, as soon as the symbol can be identified within
238	its designated context. For example, the following list of StepCodes
239	for four Chinese characters:

241	xin1qin1jin0        <new>
242	zhu2ge1ge0          <bamboo>
243	qing1shui1qing0     <clear>
244	hua2hua2shi0        <Chinese>

246	Each of these codes uniquely identifies a CJK [CJK] character of a UCS
247	[UCS] code point in CJK section using Pinyin spelling. They all have
248	three parts: the first part is Pinyin spelling of the character; the
249	second part is the digit following the Pinyin. The digit indicates the
250	end of a character spelling and its tone mark. The two parts together
251	are the transliteration of a character. The remaining alphanumeral
252	string following the first digit is the third part of StepCode. It
253	is in the same format as the character transliteration, and is the
254	radical part of the transliteration.

256	When a registration calls for the four characters, then the four
257	characters may be combined into one new alphanumeral string:
258	    "xinzhuqinghua1212qin1jin0ge1ge0shui1qing0hua2shi0".
259	The list of StepCodes for the above four characters results from
260	two complete iterations of StepCode protocol.

262	Since it is enough for "xinzhuqinghua" to identify a well-known name
263	in DNS system, "xinzhuqinghua1212" for a not well-known name, and
264	"xinzhuqinghua1212qin1jin0ge1ge0" for pin-pointing a rarely known name,
265	it is up to the registrant and the zone manager to register a DNS
266	identifier of just the right length for the user, and to keep the full
267	record for the code reversal process, depending on the IETF and ICANN
268	decision to support a dual-record system [Uname][IDNmap].

270	1.5 StepCode Features

272	The StepCode protocol is fully compatible with DNS specification, yet
273	is mnemonic, friendly multi-language accessible code points, and
274	accommodates mixed script use.

276	1.5.1 Multi-language access of the same UCS code point

278	Similar to the method used for searching books in a library, such that
279	CJK characters may be accessed by different language users. For
280	example, the following four Korean characters may be coded as:

282	U+????      sim0sim0         Hanja <ten>
283	U+2fa5      ni0ni0           Hanja <inside>
284	U+351a      to0t2o0          Hangul <to>
285	U+3747      mot0m2o2t0        Hangul <mot>
286	 (Note 1: the transliteration is used in [Translit 97], where the "t"
287	  in "mot" should be consistent with a jamo for a Korean sound value.
288	  Note 2: a hangul may not need to be treated as a CJK character. If
289	  it is the case, then "to" and "mot" MUST be unique within all hangul
290	  symbols.)

292	The two Hanja characters are CJK code points used by at least three
293	languages, and Hangul is only used by Korean. When the four
294	characters are combined into a DNS name, its full name takes the form:

296	  simnitomot0000sim0ni0t2o0m2o2t0

298	so kr-simnitomot0000sim0ni0t2o0m2o2t0.com can be the DNS name or it
299	may be the full name record to be kept at local registrar and be
300	registered with DNS as "kr-simnitomot.com". StepCode permits different
301	language tags to access the same glyph in [ISO10646].

303	1.5.2 Multi-script Accommodation

305	StepCode protocol allows mixed scripts to co-exist.  For example,
306	the five Kana, diacritic mark and Kanji from Japanese:

308	U+3055      sa            <kana>
309	U+30fc      1	        <diacritic macron>
310	U+3073      bi            <kana>
311	U+3059      su            <kana>
312	U+????      gyo1go0       <Kanji business>
313	 (Note: Only one radical in a Kanji is coded, since the total number
314	  of Kanji is much smaller set than Han character set. Thus, one radical
315	  to be coded may be enough to guarantee a unique code within Kanji.)

317	Due to more complex decoding for Kanji than that of Kana, a delimiter
318	for the two seems needed, so a digit 0 may be required to end the kana
319	section. Thus the DNS name: "sa1bisu0gyo1go0" may be used. This shows
320	that StepCode protocol can be adapted to many different mixes of
321	scripts, and different languages need different script treatments.

323	1.5.3 Fully Compatible with Current DNS

325	From the Chinese, Korean and Japanese example given above, the host
326	parts have no international glyphs but US-ASCII, and can be a valid
327	entry to DNS, and allows standard compression or security treatment
328	compatible with existing hostnames.

330	1.5.4 One Mnemonic System

332	It is one mnemonic system for any scripts in UCS, such that whatever
333	the language that the zone master understands, he can refer to, sort
334	on, and support a registered IDN name.

336	1.6 Author's Disclaimer

338	This document is a guide for implementing mnemonic StepCode protocol
339	for IDN hostname identifiers in a language specific way. It is not
340	a natural language dictionary of any degree. The sound value
341	assignments of script symbols, although balanced among several
342	considerations, are not intended in any way to claim any linguistic
343	expertise. The different scripts used by any one particular user
344	group addressed in the document do not dictate the user group's
345	choice of any subsets of [ISO10646] symbols.

347	In addition, the document is biased on five issues:
348	1) The UCS symbol tabulation structure assumed is biased toward CJK users;
349	2) The mnemonic sound value is based on IPA classification;
350	3) The Latin letter value assignment is biased toward English usage;
351	4) The digits value assignment is biased toward Mandarin usage;
352	5) The language tag function is biased toward Indian languages.

354	1.7 Terminology

356	The key words "MUST", "SHALL", "REQUIRED", "SHOULD", "RECOMMENDED",
357	and "MAY" in this document are to be interpreted as described in
358	 [RFC2119].

360	Examples in this document use the notation from the Unicode Standard
361	[Unicode3] as well as the ISO 10646 names. For example, the letter
362	"a" may be represented as either "U+0061" or "LATIN SMALL LETTER A".

364	A non-Roman character is also denoted in its Romanized form,
365	followed by its English equivalent word in <>, for example "zhong
366	<heavy>", without reference to Unicode, due to difficulties in pinning
367	down all the code points used in this document from the UCS table.

369	An IPA symbol is presented in [] when it is referred to within text. For
370	example, [c] is for IPA sound value "c", not Latin letter "c".

372	StepCode assumes its encoding is language specific: each language, as it
373	is defined in [ISO10646], has its mnemonic encoding and is a part of
374	ACE encoding prefixed to ASCII host name only, for example, "kr" for
375	Korean, "ja" for Japanese. The encoding is called "language tag" of a
376	DNS host name (for language tag implementation see [IDNmap] Section 3).
377	The DNS host name with such a language tag is called a "language tagged
378	ACE", or "T-ACE".

380	StepCode converts a list of internationalized characters at a client
381	site into a string of US-ASCII that are acceptable as a host name in
382	current DNS host naming usage. The former are called a list of "IDN
383	identifiers" or a "glyph" for a symbol represented by one code point
384	in [ISO10646] or "glyphs" for a string of glyphs and the post-converted
385	ASCII string is called a "DNS identifier".

387	[Nameprep] defines Unicode character mappings, normalization and
388	exclusions of internationalized host names. The characters from input
389	and in the mapping and normalization lists are called "IDN-label", or IDN
390	input, which includes symbols mapping to null. IDN-label is a superset
391	of IDN identifiers in terms of UCS code points.

393	The "IDN-label" at a client site may be represented by Unicode, GB code,
394	JIS code, BIG5 and others which may contain equivalent information.
395	These code forms are referred to as language specific "localized code
396	points", or "local display codes".

398	A large script such as CJK or UCS can be classified into three glyph
399	groups:
400	1) IDN letters: which can be directly mapped onto an alphanumeral
401	   symbol under the Latin mnemonic hat, for example, Bopomofo, Kana,
402	   Arabic, Bengali, Hebrew, Jamo, diacritics, etc.
403	2) IDN radicals: a minimum number of frequently used glyphs which are
404	   also used as radicals in other glyphs, and often have independent
405	   pronunciation, for example, U+2f00 to U+2fd5, U+2e80 to U+2ef3, and
406	   others scattered in CJK Plane 0 blocks;
407	3) IDN icons: the rest of the glyphs in the script, for example the
408	   majority code points of CJK, enclosed alphanumerics, enclosed CJK
409	   letters and ideographs.

411	The protocol uses US-ASCII to denote the phonetic elements of
412	a script and calls for standardizing such a mapping for each
413	language tag. The phonetic elements of a glyph is called "spelling"
414	of the glyph and is called "stem" for that of a radical.

416	StepCode procedure may have more than two complete iterations.
417	The first iteration is called "character transliteration" though
418	it may take in more linguistic defined elements in such a conversion
419	than a common term transliteration may imply.  The second iteration
420	is called "radical transliteration", for it transcribes radicals of
421	a glyph.  The character to transliterated character table is called
422	"tagged section map" [IDNmap Sec. 2.2.3] or "tagged map", while a
423	transliterated character is called a "StepCode". The process of
424	converting an input string to T-ACE using a tagged map is called
425	"language tagged procedures" [IDNmap Sec. 4].

427	According to the phonetic nature of world scripts, three groups are
428	referred to: Alphabet systems (including Latin, Cyrillic and Greek),
429	Consonant systems (i.e. Indian, Arabic languages), and Character
430	systems (i.e. CJK languages).

432	1.8 IDN summary

434	The StepCode is a language dictated flexible ACE protocol and it is
435	a complement to the currently proposed, UCS flat treatment ACE. Its
436	coding process reflects "Crowd Control" concepts to better organize
437	characters and symbols before they are applicable in IDN system. To
438	deliver its full potential and to be more effective, it needs more
439	consensus building among groups regarding code point treatment
440	[Stone], which would be arguable points even if a flat UCS code point
441	treatment ACE is deployed alone in any case.

443	2. Host Name Transformation

445	According to [STD13], host parts must be case-insensitive, start
446	and end with a letter or digit, and contain only letters, digits,
447	and the hyphen character ("-"). This excludes any internationalized
448	characters, any font variations, case variations, character set
449	variations, as well as many other characters in the ASCII character
450	repertoire. Further, domain name parts must be 63 octets or shorter in
451	length including any language or other encoding tags.

453	User friendly encoding has to be coherent with users' native languages,
454	and consequently, host name transformation is dependent on the language
455	tag [IDNmap Sec. 3] selected.  As a StepCode encoding guide, the
456	following discussion is focused on four different language groups:
457	Alphabet systems, Consonant systems, Character systems and mixed script
458	systems, from the simplest to more complex ones, and start with a
459	general description of StepCode syntax.

461	2.1 StepCode Syntax

463	A StepCode unit is a string of [A-Za-z0-9] letters without any white
464	spaces, BLANK, in between. For each StepCode unit, there are data
465	elements indicated by "", which is a MUST supplied element, and []
466	where the element is optional, and / where the data is selectable.

468	Sx stands for primary sound value or spelling of xth glyph;
469	Tx stands for secondary sound value or tone of xth glyph;
470	Ry stands for Stem for yth radical;
471	Ly stands for Layout relation from radical y to y+1;
472	Rx.y stands for Stem for Xth glyph and its yth radical;
473	Lx.y stands for Layout relation from Xth glyph and its radical y to y+1.

475	2.1.1 One glyph

477	A code point or a glyph in UCS can be an IDN letter, an IDN radical or
478	an IDN icon. An IDN letter is a phonetic symbol in its native
479	language context marked by a language tag. For example Kana are IDN
480	letters in Japanese context. An IDN radical is an independent glyph often
481	used as a component of another glyph, or a glyph in a foreign language
482	context. For example a simple Han character or a Han radical (U+2e90 -
483	U+2ef3), a Greek letter in Chinese context. An IDN icon is a composite
484	glyph displayed in one display unit, normally a two dimensional square
485	area. The majority of CJK characters are IDN icons. IDN icons can be
486	viewed as compositions in terms of radicals, or IDN letters.

488	StepCode is language context sensitive transliteration of UCS code points.
489	The following is formal definition and examples of StepCode for a glyph.
490	The minimum code for a StepCode is one ASCII letter:
491		"S"[T][P1][L1][P2][L2]...[Py][0/BLANK]

493	Thus, the following are examples of IDN letters, radicals and icons:
494	IDN letters:
495		A        a       <Latin capital letter A>
496		U+00c2   a6      <Latin letter A^>
497	      U+0a98   gha     <Gujarati letter gha>
498	      U+0a84   u1      <Gujarati letter uu>
499	IDN radicals:
500	      U+03b1   alf0            <Greek Small Letter Alfa>
501	      U+2f26 U+5b50   zi3z0    <CJK radical son>
502	      U+2f24 U+5937   da4d0    <CJK radical big>
503	      U+2f29 U+5b0f   xiao3x0  <CJK radical small>
504	      U+2f25 U+5973   nv3n0    <CJK radical female>
505	IDN icons:
506	      U+2639   :-(0    <White Frowning face>
507	      U+263a   :-)0    <White Smiling face>
508	           U+5b59   sun1zi1xiao0   <CJK character Grandson>
509	      U+597d   hao3nv1zi0     <CJK character good>
510	      U+5c16   jian1xiao2da0  <CJK character sharp>

512	Where the Unicode are IDN identifiers, the ASCII code column is
513	corresponding transliterated StepCode, or DNS identifiers and the
514	phonetic system used is in Chinese Pinyin.

516	2.1.2 Glyphs

518	A string of glyphs is considered as one unit with only alphanumeral:

520	"S1S2S3...Sx"[T1T2...Tx][P1.1][L1.1][P1.2][L1.2]...[P1.y][0]
521			[P2.1][L2.1][P2.2][L2.2]...[P2.y][0]
522				...
523			[Px.1][Lx.1][Px.2][Lx.2]...[Px.y][0/BLANK]

525	Example of glyphs:
526	Latin    AaA^a                  aaa6a
527	Gujarati U+0a98 U+0a84          gha + u1 -> ghu1
528	Chinese  U+597d U+5b0f U+5b50   haoxiaozi333nv1zi0x0z0

530	StepCodes are language specific. The above examples are from three
531	language groups with common mix of symbols from the same languages.
532	Where the Latin example has included capital letter A Circumflex, which
533	is mapped to digit 6.

535	Gujarati letter GHA has an implicit vowel "a"; due to the transliteration
536	rule, when another vowel follows the consonant the implicit vowel is
537	replaced.

539	Chinese phrase <good boy> in the above example shows a mix of IDN
540	radicals and icons encoding, where the first three digits indicate
541	three characters in the unit, and three radical transliterations
542	immediately follow.

544	2.2 Glyph Boundary Marks

546	Most script transliterations are mapped to alphabet system consistent
547	with consonant-vowel-terminal structure. The majority of "glyph to
548	glyph sequence" and "glyph sequence back to glyph" can be done with
549	minimum amount of linguistic rules embedded in glyph sequence
550	composing and decomposing procedures.

552	There are always exceptions to any rules in linguistics. For example,
553	the uses of "-" in Chinese and Korean, the uses of "'" in French and
554	Chinese, the uses of letters "ZWNJ" in Arabic, and the use of "|" in
555	Tibetan and Devanagari to prevent two units from joining, are complements
556	to the consonant-vowel-terminal rule.

558	In DNS system, only hyphen "-" is allowed for this purpose, and there
559	may be more than one level of disjoints a host name of a script has to
560	differentiate. It is RECOMMENDED to consider an unused or non-conflict
561	letter first before the "-" has to be used in the transliteration of a
562	language tagged script. For example, the "'" in Chinese Pinyin may be
563	mapped to the letter "v" instead of a hyphen "-".

565	2.3 Encoding Steps

567	StepCode starts at a phonetic representation of a glyph with ASCII
568	letters and a digit when it is needed. This character transliteration
569	has two phases as in Sec. 2.1.1 IDN letter examples:
570	S1.1. Romanize the primary phonetic characteristic of a
571		glyph/phrase;
572	S1.2. Supplement the secondary phonetic characteristic of the
573		glyph with a digit/digits.

575	The second step of StepCode is applied to components of each glyph,
576	radical transliteration, in the same way specified in S1.1, and shown
577	in Sec. 2.1.1 IDN icon examples.
578	S2.1. Romanize the primary phonetic characteristic of a radical, B;
579	S2.2. Specify how the next radical is related to the current
580		radical, B, with a digit;
581	S2.3. If the radical contains another radical, X of B,
582		then go to S2.1 of X (and it is S2+1.1(X));
583		otherwise go to the next radical, B+1.

585	2.4 Transliteration Schemes

587	Language is a creation of human thoughts, which wander everywhere
588	disregarding boundaries. StepCode above is a rigid passageway, which only
589	lets the properly formed traffic go through. While an alphabetic script
590	structurally appears closest to Latin alphabet, a few general issues are
591	common to all transliterations. The first issue is which transliteration
592	should be implemented. Unicode Consortium has given each symbol a Latin
593	name for ease in reference. Such a name contains the main sound value of
594	the symbol, but usually more than what is needed in a transliteration.
595	For example, Cyrillic letter BE has sound value "b" in Latin, and it is
596	transliterated in [Translit 97] as a "b". This introduces transliteration
597	modification #1 to Unicode, that the sound value of a glyph MAY be
598	extracted from its Latin name from UCS standard.

600	2.4.1 Basic Phonetic Classifications

602	It is RECOMMENDED that when consulting publications on character
603	transliteration, the IPA [IPA] definition SHOULD be the primary classes
604	to be considered. IPA class is an artificial grid over an analog
605	spectrum. For each class there is a focus sound with a Latin letter
606	label, and its neighboring sound values slide into its neighboring
607	sound classes. It has the best classification on human language sound
608	values available and its focus sounds are labeled with Latin alphabet
609	letters. [Translit 97] has provided 54 romanization and transliteration
610	schemes, and SHOULD be one of the base transliteration documents.

612	2.4.2 Fuzzy Sound Value to Base Class Mapping

614	When a sound value can be described with an IPA class, then a
615	proximate letter representation can be referred. Transliteration
616	Modification #2 is to consider a letter assignment in terms of IPA class.
617	It is RECOMMENDED that when alphabet is used to represent a sound value
618	in a script, a balance between the current use of a letter in the same
619	script and common uses of the same letter in other languages shall be
620	found. The following is a comparison table of fricative alveolar-palatal
621	letter sound assignments of a group of sampled languages. The table is
622	expanded a little into Plosives, Post-Palatals and Approximants for
623	different sound value comparison with Arabic, Hindi, Vietnamese and
624	Chinese languages, and also is used as illustration of the nature of IPA
625	classification.

627	Table headers are IPA categories, abbreviated as:
628	Alveolar           Alveo
629	Postalveolar       Postalv
630	Retroflex          Retrof
631	Alveolar-Palatal   Alv-Pal
632	Front Palatal      FrontP
633	Palatal            Pala

635	Plosive            Plos
636	Affricate          Affr
637	Fricative          Fric
638	Approximant        Approx

640	Languages tagged as:
641	Chinese    zh-
642	Arabic     ar-
643	Deutsch    de-
644	English    en-
645	Esperanto  eo-
646	French     fr-
647	Latin      la-
648	Hebrew     he-
649	Japanese   ja-
650	Korean     ko-
651	Hindi      hi-
652	Lao        lo-
653	Russian    ru-
654	Spanish    es-
655	Serbo      sr-
656	Tamil      ta-
657	Urdu       ur-
658	Vietnamese vi-

660	The IPA symbol entries:
661	U+0283  sh       Latin Letter esh
662	U+0292  zh       Latin Letter yogh
663	U+0282  s2	 Latin Letter s hook
664	U+0290  z2       Latin Letter z Retroflex hook
665	U+0255  c3	Latin Letter c curl
666	U+0291  z3     Latin Letter z curl
667	U+029d  j1	Latin Letter crossed-tail j
668	c U+0327  c1      Latin Letter c cedilla

670	       Alveo     Postalv     Retrof     Alv-Pal    FrontP   Pala
671	      --------   --------    --------   --------   -------  -----
672	Plos   t  d                  t2  d2                         c
673	      --------   --------    --------   --------   -------  -----
674	                            ar-T ar-D
675	                            he-T
676	      hi-t  hi-d            hi-T  hi-D
677	      hi-th hi-dh           hi-Th hi-Dh                          hi-kh hi-gh
678	                                                           sr-c
679	                                                           vi-ch vi-c
680	                            ur-T
681	      --------   --------    --------   --------   -------  -----
682	Affr  ts  dz     tsh  dzh    ts2  dz2   tc3  dz3   tc1  dj1
683	      --------   --------    --------   --------   -------  -----
684	      zh-z      en-ch en-j  zh-zh       zh-j
685	      zh-c      ar-ch       zh-ch       zh-q
686	                de-ch
687	                eo-cx eo-gx
688	      he-ts     he-ch
689	      ja-ts     ja-ch
690	                ko-ch ko-tch/jj                                     ko-gg
691	                hi-c
692	                hi-ch
693	                lo-ch
694	                es-ch
695	          sr-dz sr-ch           sr-dz2
696	      sr-ts��
697	      ru-ts    ru-ch ru-zh
698	          ur-z              ur-zh
699	      --------   --------    --------   --------   -------  -----
700	Fric    s   z     sh  zh      s2  z2     c3   z3   c1   j1
701	      --------   --------    --------   --------   -------  -----
702	      zh-s      en-sh en-as zh-sh zh-r  zh-x
703	      ar-s ar-z ar-sh ar-zh ar-S  ar-Z                           ar-H
704	      de-s de-z de-sch
705	      eo-s eo-z eo-sx eo-jx
706	      fr-s fr-z fr-sh fr-je
707	      he-s he-z he-sh
708	      ja-s ja-z ja-sh ja-j
709	      ko-s ko-ss      ko-j
710	      hi-s      hi-sh       hi-S
711	      lo-s
712	      es-s/c                es-z
713	      sr-s sr-z sr-sh sr-zh
714	      ru-c      ru-sh
715	      vi-x vi-d vi-s
716	      ur-s ur-z ur-sh
717	      --------   --------    --------   --------   -------  -----
718	Approx                                                        j
719	      --------   --------    --------   --------   -------  -----
720	      hi-r                                                  hi-j
721	      hi-l ta-l              ta-L                             ta-l2 hi-jh
722	      ta-r                   ta-N
723	                             ur-R
724	      --------   --------    --------   --------   -------  -----
725	       Alveo     Postalv     Retrof     Alv-Pal    FrontP   Pala
726	      --------   --------    --------   --------   -------  -----
727	Table 1. Romanized Latin letter assignments found in contemporary text
728	books, bilingual dictionaries and [Translit 97].

730	More notes on table entries:
731	 The entries under column headers are in unvoiced vs. voiced pairs.
732	 The entries of the same column with a same language tag are non-aspirated
733	   and aspirated pairs in two rows, for example:
734	 	      hi-c
735	          hi-ch
736	 The uppercase letter assignments are taken from certain text books,
737	   where the transliteration takes several forms: doubling letters (common
738	   in text books), a dot under a letter(Library of Congress) and a
739	   capital letter (IPA convention).

741	Particular languages often have several sounds falling into the same
742	class, or under the neighboring classes of IPA table, but very few under
743	other labels. This phenomenon can be found above, in Table 1. It is
744	RECOMMENDED to follow conventional use of neighboring labels to
745	differentiate the value concentrated classes, provided it does not
746	conflict with other sound values which are already stable assignments.
747	Some language transliterations supplementing a secondary letter to the
748	label in focus often achieve satisfactory results, for example "ja-sh".

750	From the tabulated 18 language transliterations in Table 1, and
751	considering the conventional transliteration practice shown in the table,
752	the following sound value convention is RECOMMENDED:

754	Doubling vowel for a long vowel sound, (mostly used in Arabic)
755	Doubling consonant for sound produced from back position (Arabic, Hindi)
756	sh    for U+0283, Latin Letter esh (All in the table)
757	j     for U+0292, Latin Letter yogh (most in the table)
758	zh    for dj/dz   as an alternative for conventional dj and dz, it appears
759	                  quite popular in non-Roman languages.
760	ch     t U+0283  (Almost all in the table have done so.)
761	c      c/ts      (Though existing TS is common, a "c" is a clear favorite
762	                  for simplicity, provided that [c] is covered under "k".)
763	h      as an attachment letter for aspirated sound (as in Hindi).
764	n      for nasalization (it is hard to separate from [n], as "n-", so a
765	       diacritic is RECOMMENDED.)
766	k      for [c],[k],[q] (It is rare to differentiate all the three in a
767	        language. When it has such a need, a "kk" accomplishes the task
768	        as it is in Korean.)

770	Since most of the transliteration data of Table 1 is from English
771	literature, the recommendation above clearly is biased toward English
772	speakers. The bias is based on two reasons. The first is technical, that
773	common English does not use diacritical marks, so that it is a better
774	base scheme for adapting other language symbols which often use
775	diacritics. The second reason is the fact, shown in Table 2, that English
776	is the highest in number of population, as non-native language used in
777	the world currently.

779	            The principal languages of the world -

781	 Source: S. Culbert, NI-25, University of Washington, Seattle,
782		WA 98195, USA; Data as of mid-1993 [WORLD 95]
783	 Languages spoken by more than 100,000,000 people:

785				Native   Non-native	Total
786		Mandarin - 	  836	      126		952
787		Hindi -	  333			      418
788		Spanish -	  332			      381
789		English -	  322	      148		470
790		Bengali -	  189			      196
791		Arabic -	  186			      219
792	 	Russian -	  170	      118		288
793		Portuguese -  170			      182
794		Japanese -	  125			      126
795		German -	  98			      121
796		French -	  72			      124
797		Malay-Indonesian - 50	105		155

799	Table 2. Top four non-native languages used in the world: English,
800	  Mandarin, Russian and Malay-Indonesian.

802	2.5 Alphabetic Script Transformation - Mechanical Methods

804	Transliteration is mostly table lookups with minimum rules to implement.
805	Although alphabetic script transliteration is simplest, it is the place
806	to specify transliteration table format and a few basic concepts and
807	basic decision points in StepCode implementation, such as which phonetic
808	system shall be selected, which foreign symbol set to be included in a
809	language tagged script range [IDNmap] and how to include a foreign
810	symbol or a symbol set.

812	2.5.1 Transliteration Tables

814	Transliteration table usually contains two columns.  To make referencing
815	easy for a layman, it is RECOMMENDED that transliteration tables contain
816	at least four columns: ASCII symbol, UCS glyph, IPA sound value, and
817	examples of spoken words of the language as shown in Table 3, with
818	necessary comments.

820	ASCII  UCS           IPA              Example
821	ru-
822	a	U+0430		U+0251 :         matb
823	b	U+0431		b                co6aka
824	v	U+0432		v
825	g	U+0433		g
826	d	U+0434		d
827	e	U+0435		e
828	j	U+0436		U+02a4
829	z	U+0437		z
830	i	U+0438		i:
831	y	U+0439		i
832	k	U+043a		k
833	l	U+043b		l
834	m	U+043c		m
835	n	U+043d		n
836	o	U+043e		U+0259
837	p	U+043f		p

839	r	U+0440		r
840	c	U+0441		s       ("s" in [Translit 97])
841	t	U+0442		t
842	w	U+0443		u:
843	f	U+0444		f
844	x	U+0445		x       ("kh" in [Translit 97])
845	ts	U+0446		ts
846	ch	U+0447		U+02a7
847	sh	U+0448		U+0283
848	sch	U+0449		U+0283 U+02a7  ("shch" in [Translit 97])
849	q	U+044a		(silent)
850	h	U+044b		U+0263
851	q	U+044c		(soften the last consonant)
852	a	U+044d		U+00e6
853	iu	U+044e		ju:
854	ia	U+044f		j U+0251 :

856	Table 3. Russian Transliteration Table.

858	The third and fourth columns are convenient references to phonetic data
859	threads online.

861	Structurally, alphabetic script is similar to Latin, where some letters
862	may represent different sounds from the Latin letters. For example, in Table 3
863	[Russian 44] the letters "x" and "c" are kept as Cyrillic letters, but in
864	[Translit 97] they are transliterated to "kh" and "s" respectively. Since
865	the letters used here do not present conflict assignment with other
866	letters, it is in the best interests of the native speakers to decide
867	which version shall be used as DNS identifiers.

869	2.5.2 Mixed Use of Alphabetical Scripts

871	The major alphabetical scripts are Latin, Greek and Cyrillic, with very
872	few cases using symbols from another script, for example "AGAPE" is Greek
873	in Latin script, not in Greek script.  It is RECOMMENDED to have three
874	languages tags: la-, el- and ru- for Latin, Greek and Cyrillic, as three
875	respective primary language tags [IDNmap] for alphabetic scripts.

877	If an English user wants to include a symbol from Greek, he has to wait
878	for Latin tag to include Greek code block as its second script, if there
879	is enough demand for such a service. In this case, there are two methods
880	to include the transliteration table for Greek symbols in Latin tag.

882	The first one is to use a digit to indicate the second script set, as in
883	column 1 of Table 4, and is called "Overflow Symbol Mapping" (Section 3.3),
884	for simplicity in mechanical filling with a second set of symbols.

886	The second method, called "Radical mapping", is shown in column 2 of
887	Table 4. The name "radical" for Greek symbol is an analogy to radicals in
888	CJK, for a Greek letter has a sound and a name and can not be decomposed.
889	That is, it is not a composite glyph, nor can it be sub-divided. They are
890	treated in a similar way to the CJK character set in a foreign language.

892	A secondary script attached to Latin language tagged section map:

894	la-
895	a	U+0061
896	b      U+0062
897	...
898	z      U+007a

900	a9     alf0     U+03b1
901	b9     bet0     U+03b2
902	c9     gam0     U+03b3
903	d9     del0     U+03b4
904	e9     eps0     U+03b5
905	f9     zet0     U+03b6
906	g9     eta0     U+03b7
907	h9     the0     U+03b8
908	i9     iot0     U+03b9
909	j9     kap0     U+03ba
910	k9     lam0     U+03bb
911	l9     mu0      U+03bc
912	m9     nu0      U+03bd
913	n9     xi0      U+03be
914	o9     omi0     U+03bf
915	p9     pi0      U+03c0
916	q9     pho0     U+03c1
917	r9     fsi0     U+03c2
918	s9     sig0     U+03c3
919	t9     tau0     U+03c4
920	u9     ups0     U+03c5
921	v9     phi0     U+03c6
922	w9     chi0     U+03c7
923	x9     psi0     U+03c8
924	y9     ome0     U+03c9

926	Table 4. Two methods to expand the Latin script.

928	The advantage of Column 1 is that it is short and regular, provided the
929	digit 9 is not assigned to something else. The disadvantage is that it is
930	hard to remember which letter of Greek is in that Latin letter position.

932	The second method shown in Column 2 is easy to remember since a Greek
933	letter is mostly spelled out in a syllable ( and can be mapped according
934	to its sound value instead of the mechanical flooding as they are in
935	Table 4), but is harder for a program to tell the character boundary.
936	A few options are available for amending the radical mapping
937	implementation:
938	1) Filling the short name up to make all the Greek symbols with uniform
939	  length, say 3 letters. By recognizing digit 0, the decomposing procedure
940	  can take preceding 3 letters as one symbol, this is called Protocol
941	  method.
942	2) Insert another digit 0 before the Greek symbol to mark a foreign
943	  symbol, and is called Marker method.
944	3) Insert a hyphen "-" before the Greek symbol, to make an independent
945	  sub-name unit, and is also a Marker method.

947	The advantage of the above IDN radical symbol treatment is that it is
948	flexible in terms of the number of symbols to be introduced, and in terms
949	of naming such a symbol so that a native reader understands it; it can also
950	be used for trademark encoding when there is such a request. The
951	disadvantage is the lack of market data to support such an implementation.
952	It is RECOMMENDED that radical mapping be selected for introducing foreign
953	symbols into a language tag.

955	Assuming the above recommendation is accepted, it is RECOMMENDED to use
956	Method 2) to mark a foreign symbol in a language tag, for it accommodates
957	variable length description of a foreign symbol, it is consistent with CJK
958	symbol treatment discussed in Section 2.7 and it preserves method 3) for
959	users to make individual decisions on their naming.

961	2.6 Consonant Script Transformation - Developmental Issues

963	The name for this group of scripts may not be accurate; it could just as
964	well be called the "rest of scripts" besides Euro and Han scripts. The
965	main concern in treating this group of scripts is treating each script
966	independently and not letting any rules made now develop into extremes in
967	the near future. For example, one extreme is to forbid any new symbols to
968	enter a language tagged range, the other is to open up the whole UCS for one
969	language tag.  The Hindi language section map is selected here to examine
970	implementation issues, since it reflects some of the reality in that user
971	sector as well as in the engineering sector regarding language tag design
972	issues [Stone].

974	hin-

976	7	U+0901		(nasalization)
977		U+0902         (no decision)
978		U+0903         (no decision)

980	a	U+0905		U+028c
981	aa	U+0906		U+0251 :
982	i	U+0907		I
983	ii	U+0908		i:
984	u	U+0909		U+028a
985	uu	U+090a		u:
986	ri	U+090b		ri
987	lri	U+090c		lri
988	e	U+090d		e
989	e	U+090e		e
990	e	U+090f		e
991	ai	U+0910		U+00e6/aI
992	o	U+0911		U+0259 U+028a
993	o	U+0912		U+0259 U+028a
994	o	U+0913		U+0259 U+028a
995	au	U+0914		U+0254 : / a U+028a

997	k	U+0915		k
998	kh	U+0916		x
999	g	U+0917		g
1000	gh	U+0918		g'
1001	ng	U+0919		U+014b
1002	c	U+091a		U+02a7
1003	ch	U+091b		U+02a7 '
1004	j	U+091c		j
1005	jh	U+091d		j'
1006	ny	U+091e		ni
1007	tt/T	U+091f		U+0288

1009	tth	U+0920		U+0288'
1010	dd	U+0921		U+0256
1011	ddh	U+0922		U+0256'
1012	nd	U+0923		nd
1013	t	U+0924		t
1014	th	U+0925		t'
1015	d	U+0926		d
1016	dh	U+0927		d'
1017	n	U+0928		n
1018	nn	U+0929		n 	(for Tamil n)
1019	p	U+092a		p
1020	ph	U+092b		p'
1021	b	U+092c		b
1022	bh	U+092d		b'
1023	m	U+092e		m
1024	y	U+092f		y

1026	r	U+0930		r
1027	rr	U+0931		r  	(for Tamil r)
1028	l	U+0932		l
1029	ld	U+0933		ld
1030	ll	U+0934		l      (for Tamil l)
1031	v	U+0935		v
1032	sh	U+0936		U+0283
1033	ss	U+0937		U+0282
1034	s	U+0938		s
1035	h	U+0939		h

1037	q	U+0958		q
1038	khh	U+0959		q'
1039	ghh	U+095a		G'
1040	z	U+095b		z
1041	dddh	U+095c		U+0256 d'
1042	rh	U+095d		U+0280
1043	f	U+095e		f
1044	yy	U+095f		y:

1046		U+093a
1047		U+093b
1048		U+093c
1049		U+093d
1050	aa	U+093e		U+0251 :
1051	i	U+093f		I
1052	ii	U+0940		i:
1053	u	U+0941		U+028a
1054	uu	U+0942		u:
1055	ri	U+0943		rI
1056	rii	U+0944		ri:
1057	e	U+0945		e
1058	e	U+0946		e
1059	e	U+0947		e
1060	ai	U+0948		U+00e6 / aI
1061	o	U+0949		U+0259 U+028a
1062	o	U+094a		U+0259 U+028a
1063	o	U+094b		U+0259 U+028a
1064	au	U+094c		U+0254 : / a U+028a

1066	Table 5. IDN Hindi section Map [Hindi 98].

1068	Observations of Table 5:
1069	1) It has no example word column;
1070	2) It has not made decisions on several code points;
1071	3) It has adopted three Tamil symbols;
1072	4) the extra long vowel sound is indicated by doubling the vowel letter;
1073	5) the retroflex sound is indicated by doubling the consonant letter,
1074	   while other forms exist, such as uppercase letter or an under letter
1075	   mark as they are shown in Table 1 and [Translit 97];
1076	6) the aspirated sound is indicated by letter "h" instead of an apostrophe
1077	     "'" used in [IPA];
1078	7) the symbol transliteration is not mechanical mapping, it needs
1079	   linguistic rules to composing and decomposing a transliterated Latin
1080	   string for Hindi.
1081	8) the nasalizing sign, Devanagari Sign Candrabindu, is mapped to digit 7,
1082	  since it is the last diacritical mark used in [Translit 97]. The
1083	  under-letter marks either have been reflected in Table 5, or ignored
1084	  due to implicit transliteration of Table 5;
1085	9) the section "U+093e - U+094c" is equivalent to section "U+0905 -
1086	   U+0914"; this section of symbols is not treated separately in
1087	   [Translit 97]. These symbols could be included in canonicalizing
1088	   procedure specified in [Nameprep] but dependent to input code
1089	   processing.

1091	Each of the observations flags a developmental issue:
1092	1) Concerning the IDN as a long term solution or a short term fix. If this is
1093	  a long term solution, then to fill up the column will benefit long term
1094	  reference, there is no need to revisit the same issue when the reference
1095	  is organized for later comers.
1096	2) The assignment of 10 digits has to consider its common meaning to
1097	  other languages so that, there is conformity semantics for less confused
1098	  implementation and long term use.
1099	3) Implies that Tamil language often appears among Hindi speakers. It is
1100	  RECOMMENDED to consider inclusion of one to two other scripts for each
1101	  of languages in Consonant language group in the future IDN releases.
1102	4), 5) and 6) are differences with [Translit 97] implementation. Advantages
1103	 of this implementation is not over-load diacritical marks and is more
1104	 reader friendly, with easier linguistic interpretation. Disadvantage is
1105	 using variable length of Latin letters for each Hindi symbol.
1106	7) As result of 4) 5) and 6), more linguistic understanding is required
1107	 in implementation of a language tagged procedures.
1108	8) With the more reader friendly treatment of Devanagari shown in 4)-7),
1109	 there are enough digits to be used for other aspects of the linguistic
1110	 issues, such as boundary, nasal, tonal or stress marks.
1111	9) Case mapping is a common issue, which can be applied equally to
1112	 Latin, Chinese, Japanese, Hindi as well as whatever there are such
1113	 requests, and which have been defined by their primary users. In any
1114	 case, the Hindi case mapping requires a better understanding of how the
1115	 symbols are used at the user end both from keyboard, as well as keyboard
1116	 signal to text transformation and local code exchange standard. When
1117	 such an expertise is not available, there is still no base for exclusion
1118	 for such a case mapping in IDN.

1120	2.7 Character Script Transformation - Feasibility

1122	The commonly used symbol set for Chinese, Japanese and Korean is around
1123	4000 characters each, with some differences in forms, while majority of
1124	the symbols in each set over lap with the other two. Access of the 4,000
1125	characters is a headache if one has to select from a table of 4,000
1126	character without some efficient indexing system. For UCS CJK character
1127	set, the issue is to address over 21,003 characters using one primary
1128	language tag.

1130	For languages with a large number of glyphs, such as the CJK set, which is
1131	impossible to map onto a Latin alphabet directly, a three layered scheme
1132	is RECOMMENDED, and a minimum set of glyphs of a script which are often
1133	used as parts of other glyphs, such as CJK radicals, SHOULD be derived.

1135	In the IDN system, the IDN letters include Bopomofo, Kana, and Jamo
1136	phonetic symbol sets.  Since these systems all have been used, have stable
1137	transliteration standards to refer to, and have been discussed in
1138	previous sections, in this section the discussion will be focused on
1139	radical transliteration.

1141	2.7.1 Character transliteration Scheme for IDN Radicals

1143	Radicals are building blocks of the CJK character set. Radicals are independent
1144	symbols with semantics and pronunciation or names. For example,

1146	     Unicode  Short form   Long form
1147	      U+03b1     alfa0               <Greek Small Letter Alfa>
1148	      U+5b50       zi0       zi3z0
1149	      U+5937       da0       da4d0
1150	      U+5b0f     xiao0     xiao3x0
1151	      U+5973       nv0       nv3n0

1153	are five radicals, where the first part of each code is the name of the
1154	radical, the second part as they are shown in the last column is its
1155	primary sub-radical name letter. Mandarin has 417 sounds with average 4
1156	tones each, total covers basic radical set of 1,500.  With 25 letters
1157	before the delimiter 0, theoretically it is enough to give 23,000 UCS
1158	characters unique index. However, it is not enough to give each character
1159	a unique mnemonic name to facilitate users' access.

1161	With the fast expansion of memory chips and transmission speed in the last
1162	10 Years, vast amount of data can be stored at any local chips for fast
1163	references. It is doubtful that designing an index system conforming to the
1164	above theory is wise. Instead, user friendly configuration should have the
1165	highest priority, and a complete set of data at ease of access shall be
1166	the base for a new IDN design philosophy.

1168	Considering the radical encoding above, although it is enough to have
1169	Pinyin with tone indicator as its transliteration, as zi3, xiao3, da4,
1170	and nv3, it creates a different coding format, such that when they are
1171	mixed with an IDN icon, two different formats require more rules in
1172	processing. For simplicity, IDN radicals takes the same StepCode format
1173	as IDN icons, as shown in the last column on above four examples, which
1174	all end with a digit 0 as delimiter, but include only one letter as
1175	their sub-radical encoding to indicate a simple character with no further
1176	decomposing.

1178	Thus, the longer form of IDN radical transliteration applies when 1) the
1179	radical set is large within a language tag, and the diacritical marks
1180	play a part in the transliteration; 2) the radicals are used with large
1181	IDN icon set, such as CJK, a uniform format with the larger set is
1182	Preferred over code complexity, so the radical is treated as an IDN icon.

1184	The short form of IDN radical transliteration applies, when 1) the radicals
1185	are small set of foreign symbols under a concerned language tag, 2) a
1186	radical is used as radical transliteration of an IDN icon transliteration,
1187	as radical "xiao0" in Han character Sharp, "jian1xiao2da0".

1189	2.7.2 Radical Naming Convention

1191	Some glyphs in the IDN radical set are most frequently used glyphs by
1192	themselves, some are used by themselves only in a particular language,
1193	yet some of them never stand alone, and their names follow naming
1194	convention which is listed below:

1196	"pang" - a radical on the left, "p" for short;
1197	"bian" - a radical on the right, "b" for short;
1198	"tou"  - a radical on the top, "t" for short;
1199	"di"   - a radical on the bottom, "d" for short;
1200	"xin"  - a radical in the middle, "x" for short;
1201	"kuang"- a container or an enclosure radical, "k" for short.

1203	Since CJK characters are written from left to right and top-down,
1204	often the "pang" is the first radical of a character to be used as the
1205	key for searching into dictionaries and is partially listed in UNICODE,
1206	so "pang" has the most number of them appear in an index table in a
1207	regular Han dictionary.

1209	2.7.3 CJK Character Coding Process

1211	CJK Character coding process reflects "Crowd Control" concepts: 1) survey
1212	requests - sorting, 2) select leaders - identify equivalent cases, 3)
1213	mark directions - mnemonic encoding, and 4) divert traffic - leave
1214	individual issues out for other applications. The principle applies to
1215	other UCS symbol transliteration encoding processes as well.

1217	The naming process SHOULD reflect a user's viewpoint, not a programmer's
1218	viewpoint. The following radical transliteration procedure is RECOMMENDED:
1219	1) Sort all the characters, include IDN icons, by Romanized names, which
1220	  is Pinyin for Chinese, or a Latin symbol name in UCS;
1221	2) Delete all polyphones of a character but leave one as the IDN
1222	  identifier;
1223	3) Sort all the homophones by frequency of usage counting both as a
1224	  radical and as an IDN icon, and obtain a sorted list on frequency of
1225	  usage, for example:
1226	     fei-20 fei-8 fei-3 fei-2 fei-1
1227	4) Move the hard to decompose character to the front, and suppose fei1-3
1228	  is such a character, then
1229	     fei1-20 fei1-3 fei1-8 fei1-2 fei1-1
1230	5) Adjust homophone and polyphone characters as needed for easy coding
1231	  discrimination;
1232	6) Code each of the above symbol in the order prepared above:
1233	  fei1-20   fei1f0  <fly>  (radical)
1234	  fei1-3   fei1b0  <not>   (radical)
1235	  fei1-8  fei1nv1yi0  <concubine>
1236	  fei1-2  fei1caot2fei0 <poor>
1237	  fei1-1  fei1ko1fei0 <fei>
1238	  such that the front radical or character gets a shorter name;
1239	7) Identify semantically equivalent character set, and assign only one
1240	 character per set to IDN identifier.

1242	Additional care MUST be applied in above process for future application
1243	system  developments:
1244	1) Reserve the polyphones opted out from Naming Process 2) and 5) above
1245	  for other applications, for example user input processing, not
1246	  discussed in IDN-map [IDNmap] but indicated in [SLS].
1247	2) Reserve the members of semantically equivalent character set from
1248	  Naming Process 7) above for other applications, for example IDN name
1249	  display processing, which are not discussed in IDN-map [IDNmap], but
1250	  indicated in [SLS].
1251	3) For non-character radicals one may fall onto in Naming Process 6),
1252	  a multi-syllabic name may be shortened with conventions specified in
1253	  Section 2.7.2, for example, "cao zi tou" is shortened to "caot" in
1254	  "fei1-2 fei1caot2fei0 <poor>" above.

1256	It is RECOMMENDED that the glyph transliteration process of CJK
1257	Characters NOT be bound by any particular radical list; such lists are only
1258	references as historical character decompositions. This introduces
1259	Transliteration modification #3 to UNICODE document, CJK radicals and
1260	radical supplement: U+2f00 to U+2fd5 and U+2e80 to U+2ef3.

1262	Other limitations posted by IDN system application are discussed in
1263	[Stone] Section 3. Observing limitations and follows the above coding
1264	process and sort out equivalent character set phonetically and
1265	semantically is REQUIRED as the first step to tame "A Tangled Web"
1266	[RFC 2825].

1268	2.7.4 Use of character transliteration

1270	It was a struggle to decide to put a full description of a Han character
1271	as its encoding or as its index, until the recent release of a wrist
1272	watch sized computer. It is clear that such a full description of a
1273	character will benefit symbolic processing greatly. For example, an
1274	automated voiced teaching tool may generate instructions on characters
1275	directly from the transliteration.  IDN registration software can extract
1276	a DNS identifier from a  full character description if such a holocode is
1277	available for access. For example, from the following IDN radicals and
1278	icons:
1279	          U+5b59   sun1zi1xiao0
1280	      U+597d   hao3nv1zi0
1281	      U+5c16   jian1xiao2da0
1282	      U+5b50   zi3z0
1283	      U+5937   da4d0
1284	      U+5b0f   xiao3x0
1285	      U+5973   nv3n0

1287	It is easy to extract a transliterated word from the first part of the
1288	above listed StepCode, and get the word "haoxiaozi".  It is just as easy
1289	to match the second part, the radical transliteration only, to refer back
1290	to the character's pronunciation. This is a hint for another type of user
1291	friendly input glyph processing.

1293	2.8 Mixed Script Transformation - Implementing Japanese Tag

1295	Japanese using different phonetic system, its homophone list would be
1296	different with that of Chinese, but the coding procedure described in
1297	Section 2.7.3 SHOULD be the same.

1299	Section 2.7 concerns keeping one format for two types of characters,
1300	the radicals and icons of the same script. Japanese uses two different
1301	scripts from two script groups, kana and Kanji. Since Kana are IDN
1302	letters, and digits are diacritical marks of the letter preceded and
1303	appear at non-regular places, only digit 0 is reserved as delimiter. To
1304	include a Kanji among IDN letters, the rule of delimiter 0 SHOULD be
1305	applied as discussed in Section 2.5. For example, the Japanese section
1306	map:

1308	U+3055      sa            <kana>
1309	U+30fc      1	        <diacritic macron>
1310	U+3073      bi            <kana>
1311	U+3059      su            <kana>
1312	U+????      gyo1go0       <Kanji business>

1314	Thus the DNS name "sa1bisu0gyo1go0" is readily available to be composed
1315	from these transliterated glyph codes.

1317	3. Numerical Symbol Value Assignments

1319	Though, it can be argued even among native speakers regarding a sound
1320	value of a symbol, the domain name identifiers only have 26 letters
1321	and some reasonable combinations within a script. These are the primary
1322	sound elements of a script in any case. Some changes to the primary
1323	sound elements are conventionally represented by modification marks
1324	to a primary symbol. Some modifications are significant and can be
1325	transcribed by a vowel from an alphabet system such as in Arabic. Others
1326	may be represented by a diacritics, as they are in French. UNICODE has
1327	provided clear separation along this line and some instructions on the
1328	functions of modification marks.

1330	Unicode also has listed more than 64 general diacritical marks, U+0300 to
1331	U+0340, while the use of them in a language is not more than 12 by
1332	[Translit 97], (Hindi 12, Ottoman Turkish 11, Azerbaijani and Telugu both
1333	have used 10). Among the usage, the under-letter diacritical marks can be
1334	reflected in letters by conventional transliteration methods used in
1335	dictionaries and text books as shown in Hindi transliteration Table of
1336	Sec. 2.6, so that none of them will need more than 9 diacritical marks.
1337	It is REQUIRED that digit 0 is reserved as icon delimiter from
1338	diacritical mark functions.

1340	Transliteration modification #4 to UNICODE document is to use a digit
1341	to represent diacritic like features, or secondary sound values of
1342	a script.

1344	A digit has no universal sound value associated to it like that of a
1345	Latin letter. It is a good word separator and a less confusing
1346	diacritical mark than that of a letter. For scripts have frequent use
1347	of diacritics, it is RECOMMENDED to use digit in place of a diacritic
1348	mark in a normalized string. For syllabic scripts, it is RECOMMENDED
1349	to use digits at the end of an IDN identifier to indicate a semantic unit
1350	and the number of IDN identifiers in a transliterated string as shown in
1351	Section 2.

1353	Although 26x10 is a two dimensional map, it can be filled with more than
1354	two phonetic aspects of a script.  With increased complexity, the
1355	mnemonic value diminishes gradually. For simplicity, four phonetic
1356	mapping rules SHOULD be observed: R1. Diacritic mark mapping; R2. Phoneme
1357	Mapping; R3. Overflow consecutive slot mapping; R4. Priority elements
1358	mapping.

1360	3.1 Diacritic Mark Mapping

1362	[R1] Graphic based Diacritics mapping. For some scripts a
1363	secondary phonetic elements have to be marked for their users.
1364	For example European scripts, a simple diacritics mapping is
1365	RECOMMENDED, where the digits MAY denote common diacritics, tones
1366	and suprasegmentals.

1368		Tone mark		Diacritics
1369	0	no tone		voiceless (o)
1370	1	flat/high(-)/long	macron (-)
1371	2	global rise (/) 	acute	(/)
1372	3	dip and rising (v)    breve (v)
1373	4	global fall (\)	grave (\)
1374	5	thrill (~)		tilde (~)
1375	6	rising-falling(^)	circumflex (^)
1376	7				umlaut( " )
1377	8	user assign		cedilla (hook)
1378	9	user assign		user assign

1380	Table 6. General Diacritics Mapping Table

1382	The assignment depends on four factors: 1) current user base with respect
1383	to keyboard assignment, 2) the number of marks in a script from a
1384	published dictionary, 3) IPA [IPA] value, 4) first come and first serve.

1386	The above assignments are due to:
1387	1) #0 is reserved as icon delimiter;
1388	2) #1 - 4 due to common naming as first, second, third and fourth tone in
1389	  Chinese;
1390	3) #6 for common Qwerty keyboard assignment;
1391	4) #5 and 7 for frequent appearance in Russian, German, Spanish and
1392	   Vietnamese;
1393	5) #8 a place holder for under-letter diacritic mark for Arabic and Hindi
1394	  languages.
1395	6) #9 for possible inclusion of Overflow symbol set assignment shown in
1396	  Section 2.5.

1398	The position of a similar mark is RECOMMENDED to stay in its
1399	respective position for ease of interoperation across script boundaries
1400	and also for users looking for replacement marks. A French diacritical
1401	mark assignment is in Table 7.

1403	French has less than eight but more than four diacritic marks,
1404	it is an example of phonetic mapping [R1].

1406	fr-
1407	0	no tone
1408	1	Silent or Liaison '
1409	2	rise/acute (/)
1410	3	(dip/breve is not used)
1411	4	drop/grave (\)
1412	5	thrill/tilde (~)
1413	6	throw/circumflex (^)
1414	7	dieresis (")
1415	8	Superscript or nasal n
1416	9	(not used for French)

1418	Table 7. French Example of Using Diacritics mapping.

1420	The French diacritical mark assignment is an example to demonstrate the
1421	usage of Table 6, not a French tag implementation. The fr- tag format is
1422	used for consistent presentation in this document.

1424	For scripts in consonant system, a subset of marks is RECOMMENDED to be
1425	mapped to ASCII letters as its first choice, while the rest MAY be
1426	assigned a digit.  Letters have associated sound values and easier for a
1427	non-native speaker to attach its IPA label association. A digit is better
1428	used for separating a secondary property from its primary sound based on
1429	IPA definitions. An Arabic example assignment is provided in [Mnemonics].

1431	3.2 Phoneme Table

1433	[R2] Sound based phoneme table mapping, where each digit specifies
1434	a variant of a base phoneme, and a maximum of nine variants may be
1435	accommodated. This rule has a best mnemonic result cross different
1436	scripts. For example, IPA symbol mapping for English in Table 8.

1438	ipa-
1439	0	1		2		3

1441	a 	U+0251 	ae U+00e6	 U+0292
1442	b
1443	c	ch U+02a7
1444	d
1445	e	U+025b 	.e U+0259	.e: U+025c
1446	f
1447	g
1448	h
1449	i
1450	j	U+02a4
1451	k
1452	l
1453	m
1454	n	ng U+014b
1455	o	U+0252	o: U+0254
1456	p
1457	q
1458	r
1459	s	sh U+0283
1460	t	th U+03b8	U+00f0
1461	u 	U+028c   	U+028a  	U+0075
1462	v
1463	w
1464	x
1465	y
1466	z	zh U+0292

1468	Table 8. Example English Phoneme Mapping

1470	IPA symbol mapping for English has used four variants. The Unicode
1471	code point indicates the IPA symbols where an ASCII symbol can not be
1472	found.

1474	A full set of IPA symbol Phoneme mapping is provided in [Mnemonics] for
1475	references.

1477	3.3 Overflowing

1479	[R3] Overflow Symbol mapping - where the symbols SHOULD fill
1480	in only consecutive slots in the opposite directions
1481	in the 26 x 10 table for ease of index computation, where the middle
1482	section of the table SHOULD be left for user selected
1483	definitions. This rule is suited for two sets of corresponding
1484	symbols of the similar scripts, for example Latin and Greek, Indian
1485	scripts. A Chinese version is shown in Table 9 for the method only, not
1486	in any way to suggest such an assignment.

1488	zh-
1489		0	no tone
1490		1	flat/macron (-)
1491		2	rise/acute (/)
1492		3	dip/breve (v)
1493		4	drop/grave (\)

1495		5	classic character drop/grave (\)
1496		6	classic character dip/breve (v)
1497		7	classic character rise/acute (/)
1498		8	classic character flat/macron (-)
1499		9	classic character no tone

1501	Table 9. Example use of Overflowing slot mapping.

1503	The above Overflow and Tone Mark mapping architecture, [R1-R3],
1504	partitions the 26 x 10 table to symmetric two different glyph sets.

1506	3.4 Priority List

1508	[R4] Priority elements mapping - Selecting a set of often used
1509	symbols to be placed in the table. For example:

1511	[R1-R3-R4]
1512	en-
1513	0	a-z
1514	1	flat/macron (-)
1515	2	rise/acute (/)
1516	3	dip/breve (v)
1517	4	drop/grave (\)
1518	5	thrill/tilde (~)
1519	6	throw/circumflex (^)
1520	7	dieresis (")
1521	8	Dingbats
1522	9	A-Z

1524		0	8 (Dingbats)
1525		a 	U+2604	/*aero or comet*/
1526		b
1527		c	U+24b8	/*copyright*/
1528		d	U+25ca	/*diamond*/
1529		e	U+24d4 	/*electron*/
1530		f	U+2708	/*fly or airplane*/
1531		g
1532		h	U+2624	/*health or Caduceus*/
1533		i	U+261e  /*index or white right pointing index*/
1534		j
1535		k	U+2654	/*king*/
1536		l	U+2661	/*love or white heart suit*/
1537		m	U+2709	/*mail or envelope*/
1538		n	U+266b	/*note or Barred eighth note*/
1539		o
1540		p	U+262e	/*peace symbol*/
1541		q	U+2655	/*queen*/
1542		r	U+2602	/*rain or umbrella */
1543		s	U+263a	/*smile*/
1544		t	U+231a	/*time or watch*/
1545		u 	U+2328 	/*utility or keyboard*/
1546		v	U+260e	/*voice or phone*/
1547		w	U+270d	/*writing*/
1548		x
1549		y	U+262f	/* yinyang */
1550		z

1552	Table 10. Example use of Priority Mapping.

1554	In fact, example Table 10 is general Latin script assignment, except the
1555	dingbats mnemonic values are keyed on English.  DNS name resolver treats
1556	uppercase same as lower case, it provides no additional way for users
1557	to assign any specific value to upper case letters. One way to expand
1558	the symbol set allowed in DNS is to use [R3] as in Table 10. The English
1559	mapping assignment above takes rules [R1-R3-R4].

1561	The above assignment rules MAY be used in a combination according
1562	to an order of weights in such an assignment.  Such an order of weights
1563	SHOULD be specified in the form [Rx-Ry-Rz-R4] in front of a
1564	transliteration table of a language tag in form of comments.

1566	3.5 Digits as Radical Layout Indicators

1568	A unified CJK character is often a composition of several independent
1569	symbols from the script. It is possible to describe a CJK character by
1570	representing a character with only its radicals. Although it can identify
1571	a character uniquely, normally it is accompanied with a number of rules
1572	with too many exceptions for the majority of users to comprehend.
1573	StepCode encoding has reduced the complexity of the rules by considering
1574	a CJK character as a simple grid of 1 to 10 units. Naming the 1 to 10
1575	units in a linear fashion results a linear representation of the glyph or
1576	its encoding.

1578	The order of prioritizing radicals of a character is important. In
1579	general, the radical that one writes it with a pen containing the first
1580	stroke of a symbol in printing manner, which is publicized as part of a
1581	national education system is the "primary radical" of the symbol. For
1582	example the character "xin1 <new>" (the digit is the tone of the character,
1583	hereafter) has two radicals:

1585	1) "qin1 <intimate>" + "jin1 <a half kilogram>"

1587	Since "qin1" may be considered as two radicals as well, the radicals
1588	list may be in the following form too:

1590	2) "li4 <stand>" + "jin1 <a half kilogram>" + "mu4 <wood>"

1592	or with different radical ordering:

1594	3) "li4 <stand>" + "mu4 <wood>" + "jin1 <a half kilogram>"

1596	In this case the "qin1" or "li4" both may be the primary radical
1597	depending on which viewpoint the user takes, which may be addressed
1598	in a different document. StepCode protocol favors 1) as discussed in
1599	Section 2.7.

1601	Variation in Radical transliteration can result in multiple
1602	StepCodes to one character within the same tagged map. It is due
1603	to 1) Radical transliteration is usually used as secondary
1604	representation of a character, however sometimes it may be used as
1605	its primary representation, when the correct sound of a character
1606	is not available to the user. 2) When viewing a character as a grid,
1607	there are disagreements on the number of units in a character. For
1608	domain names, the point of views in describing compositions of a
1609	character for a domain name MUST be limited to only one major
1610	viewpoint. The minor viewpoints SHOULD be converted to the major
1611	viewpoint, and radical transliteration MAY be the key to locate
1612	its character transliteration part through user interface when a
1613	name is registered.

1615	The digits in radical transliteration specifying how a radical of a glyph
1616	on its grid is related to the next radical, are called layout digits.
1617	Layout digits specify the relation to the next radical in line.  The left
1618	and right direction are defined by a user's left or right hand while
1619	sitting in front of a display screen or a piece of paper.

1621	The glyph layout digits are:
1622		0 - end of a character or a radical
1623		1 - to its right
1624		2 - to its underside
1625		3 - to contain the following
1626		4 - to divide the following
1627		5 - to its left
1628		6 - to its top

1630		The following selectable digits are to specify additional
1631	glyphs of the script and directions of layout.

1633		7 - to overlay itself with X then to its right;
1634		8 - to overlay itself with X then to its left;
1635		9 - to overlay itself with X then to its underside.

1637	Table 11. Glyph Layout Numeral Values

1639	The radical layout scheme trades complexity of a glyph with code length,
1640	such that the complexity can be left out when an application only needs
1641	the character transliteration.

1643	4. Language Specific Procedures

1645	Either, StepCode may be obtained directly from local display codes to
1646	StepCode phrase conversion tables or to be taken from IDN identifier of
1647	language tagged section maps. Or, it inputs directly from keyboards,
1648	where an input processing module verifies correctness of intended glyphs
1649	and normalizes a StepCode. [Appendix] is an example of such cached input
1650	processing procedure.

1652	Different scripts have different transliterations published worldwide.
1653	These publications are the base for implementing tagged maps and tagged
1654	conversions as discussed in previous sections.

1656	4.1 IDN Input Normalization Procedures

1658	The protocol contains two pairs of conversion and reversion procedures
1659	per language tag supported(See [IDNmap] Section 4.3) and calls for a
1660	minimum number of semantic independent symbols of a language to be mapped
1661	onto a Latin alphabet in a mnemonic manner (Section 2.7.3). The first
1662	pair of conversion and reversion procedures are convert language specific
1663	presentation form to a normalized form and vice versa, named as Normalize
1664	and Present procedures respectively and have been described for
1665	Latin, Arabic and Chinese script implementation in [UAX 15][Bidi][Icdn].

1667	4.2 DNS Fitting Procedures

1669	The second pair of language specific procedures converts a list of
1670	transliterated symbols to a name unit, either it is a word or a phrase or
1671	an identifier of any kinds, to fit into a desired format for any
1672	artificial goals with restrictions that format has to be reversible back
1673	into the list of transliterated symbols in its corresponding decomposing
1674	procedure. The pair of procedures is called Fitting and Decompose
1675	respectively.

1677	The purpose of assembling a StepCode is to be disassembled at its
1678	end of wire travel and indexing back into a tagged map, such that the
1679	pre-converted local display codes can be retrieved in an equivalent
1680	local display code worldwide. For some StepCode, when a list of
1681	character transliteration is combined into a string, it blurs the
1682	pre-converted symbol boundary, which is significant in their
1683	semantic differences, and interferes with correctly disassembling
1684	a StepCode string. It is RECOMMENDED in such a case, a hyphen, "-",
1685	is added as the last reserved character separator.

1687	When a post-converted string contains mixed scripts, for example
1688	Japanese domain names, exceeds maximum label length, it is only the
1689	characters with radical transliteration MAY be dropped. The truncated
1690	radical transliteration SHOULD reinsert a digit "0" to mark the end
1691	of radical transliteration, or using transmission protocols decided by
1692	network group among servers on how to deal with code length exceeding
1693	the DNS label maximum, or other protocols specific to a language
1694	tag to recover, partial recover or intelligent guesses in preventing
1695	confusion when it is decomposed.

1697	Possible protocols for Fitting/Decomposing procedures depend on the scale
1698	of such format to be placed.
1699	1) Zone records: IDN zoned record keeping at IDN name registration locale;
1700	2) Caches: Cached traffic records at client sites;
1701	3) Exceptions: Exception handling rules implemented by protocols;
1702	4) Markers: Symbolic marker interpretation for specific language tag;
1703	5) Models: Embedded linguistic rule interpretation in Fitting/Decomposing
1704	   programming languages.

1706	It is RECOMMENDED, that each language tagged procedure SHOULD specify
1707	which protocol type is implemented and what their effects are for world
1708	wide basic code maintenance.

1710	StepCode string is assembled with orders consistent with keyboard
1711	input, regardless of how it would be displayed on a screen or in
1712	URI [URI]. For some scripts, its character display order may be
1713	rearranged. Such a display order is implied by tagged display procedure,
1714	and is not a part of character transliteration nor a part of radical
1715	transliteration. Layout digits apply to layout directions within a
1716	character space as defined by UNICODE, NOT between characters.

1718	5. Embodiment of StepCode Protocol

1720	Symbolic representation in machine format with mnemonic label for human
1721	readers is a basic technique to improve human control over programs. With
1722	such a control of large name base, many artificial intelligence type of
1723	applications can benefit from it. For example, the mnemonic indexing
1724	system for UCS discussed in [IDNmap] may be extended to sort and index on
1725	trademarks and icons for automatic access needed in [WIPO].

1727	A very much needed universal keyboard access to the full spectrum
1728	of code points in UCS becomes feasible. Imagine that a user pickups a
1729	language tag from a pull-down window, and then types in the keys from a
1730	Latin alphabet labeled keyboard, gets the typed alphabet showing on the
1731	screen for the first level of input verification, and then looks at the
1732	transliteration to symbol conversion to get the second level, "spelling"
1733	verification. (A dream that the author has had for more than 15 years.)

1735	Since StepCode preserves the complete character information, it is a
1736	holocode scheme of a symbol. From which one may extract a set of radicals
1737	to infer the content of a discourse. For example, by recognizing large
1738	presence of "shui3 <water>" radical, one may infer a water body context.
1739	With such type of inference, a semantic net is not too far for reach.

1741	6. Security Considerations

1743	Much of the security of the Internet relies on the DNS. Thus, any
1744	change to the characteristics of the DNS can change the security of
1745	much of the Internet. Thus, StepCode makes no changes to the DNS
1746	itself.

1748	Hostnames are used by users to connect to Internet servers. The
1749	security of the Internet would be compromised if a user entering a
1750	single internationalized name could be connected to different
1751	servers based on different interpretations of the internationalized
1752	hostname. Thus the restriction of DNS names to a small symbol set is
1753	necessary and effective, where adding any other data format only
1754	opens the security gate to complications.

1756	7. Internationalization Considerations

1758	StepCode is designed so that every internationalized hostname part can
1759	be represented as one and only one DNS-compatible string. If there
1760	are two different ways to obtain the same glyph on a display device,
1761	then they are still two distinct hostnames, with no bearing on DNS
1762	security issues. If there is any way to follow the steps in this
1763	document and get two or more different results, it is because of an
1764	error in the domain name registration process, where one domain name
1765	registrar fails to update other domain name registrar servers about a
1766	newly registered and well researched hostname.

1768	StepCode's use of only [a-z0-9] as the basic symbol set is a linguistically
1769	sound choice, since the base classification used by IPA, the only
1770	authoritative study on the subject, is the Latin symbol set. The symbol set
1771	has been successfully applied to the majority of languages on earth, and
1772	has been proven an effective set of symbols for people of many native
1773	tongues to remember and to map to, as shown by the existing vast quantity of
1774	national standards and dictionaries. Thus [a-z0-9] is the best set of
1775	symbols to be used for universal mnemonic applications of any kind
1776	involving human records. StepCode is a symbol organization scheme to
1777	connect the symbol set to these applications.

1779	8. References

1781	[ASCII] American National Standards Institute (formerly United
1782	   States of America Standards Institute), X3.4, 1968, "USA Code for
1783	   Information Interchange". (ANSI X3.4-1968)

1785	[CJK] James SENG et al., "Han Ideograph (CJK) for Internationalized
1786	  Domain Names", draft-ietf-idn-cjk-01.txt, 11th Apr 2001.

1788	[DeFrancis 1989] John DeFrancis, "Visible Speech - The Diverse
1789		Oneness of Writing Systems", 1989, ISBN 0-8248-1207-7.

1791	[Dictionary79] Beijing Foreign Language Dept., "A Chinese-English
1792		Dictionary", 1979, BK# 9017.810.

1794	[Icdn] Xiang Deng and Yan Fang Wang, "The Implementation of Chinese character
1795	  in IDN", draft-ietf-idn-icdn-00.txt, July 2001.

1797	[IDNReq] Zita Wenzel and James Seng, "Requirements of Internationalized
1798		Domain Names", draft-ietf-idn-requirements, May 2001.

1800	[IPA] The International Phonetic Alphabet, http://www2.arts.gla.ac.uk/IPA
1801		1996.

1803	[ISO639][ISO639-2/T] ISO/IEC 639-2 2001 Codes for the Representation of
1804		Names of Languages.

1806	[ISO10646]  ISO/IEC 10646-1:2000 (note that an amendment 1 is in
1807	            preparation), ISO/IEC 10646-2 (in preparation), plus
1808	            corrigenda and amendments to these standards.

1810	[Hindi 98] "Hindi & Urdu Phrase Book", Lonely Planet Publications, 1998,
1811	     ISBN 0-86442-425-6.

1813	[Translit 97] Barry, Randall K. 1997. ALA-LC romanization tables:
1814	    transliteration schemes for non-Roman scripts. Washington: Library
1815	    of Congress Cataloging Distribution Service. ISBN 0-8444-0940-5

1817	[PinyinCon] Library of Congress Pinyin Conversion Project, "New Chinese
1818	   Romanization Guidelines",
1819	   http://lcweb.loc.gov/catdir/pinyin/romcover.html#7

1821	[Macmillan93] The Macmillan Visual Desk Reference, 1993,
1822		ISBN 0-02-531310-x.

1824	[Mnemonics] Liana Ye, "Mnemonic Symbol Mapping of UCS".

1826	[RFC 2026] S. Bradner, "The Internet Standards Process -- Revision 3",
1827	    1996, RFC 2026.

1829	[RFC2119] Scott Bradner, "Key words for use in RFCs to Indicate
1830		Requirement Levels", March 1997, RFC 2119.

1832	[RFC2277]   "IETF Policy on Character Sets and Languages",
1833	            rfc2277.txt, January 1998, H. Alvestrand.

1835	[RFC2396] Tim Berners-Lee, et al., "Uniform Resource Identifiers (URI):
1836	   Generic Syntax", August 1998, RFC 2396.

1838	[Russian 44] "New Russian-English and English-Russian Dictionary", Dover
1839	   Publications, New York, 1944, ISBN 0-486-20208-9.

1841	[SIS] M. Mealling & L. Daigle, "Service Lookup System (SLS)",
1842	   draft-mealling-sls-00.txt

1844	[STD13] Paul Mockapetris, "Domain names - implementation and
1845		specification", November 1987, STD 13 (RFC 1035).

1847	[RFC2825] L. Daigle, Ed., "A Tangled Web: Issues of I18N, Domain Names,
1848	      and the Other Internet protocols", May 2000, RFC 2825.

1850	[UAX15] Mark Davis and Martin Duerst. Unicode Standard Annex #15:
1851	   "Unicode Normalization Forms", Version 3.1.0.
1852	    <http://www.unicode.org/unicode/reports/tr15/tr15-21.html>

1854	[UNICODE] The Unicode Consortium, "The Unicode Standard". Described at
1855	            http://www.unicode.org/unicode/standard/versions/.

1857	[UNICODE30] The Unicode Consortium, "The Unicode Standard -- Version
1858	            3.0", ISBN 0-201-61633-5. Same repertoire as ISO/IEC
1859	            10646-1:2000. Described at http://www.unicode.org/unicode/
1860	            standard/versions/Unicode3.0.html.

1862	[URI] Roy Fielding et al., "Uniform Resource Identifiers: Generic
1863	    Syntax", August 1998, RFC 2396.

1865	[Versions] Marc Blanchet, "Handling versions of internationalized domain
1866	    names protocols", draft-ietf-idn-version-00.txt, October 26, 2000.

1868	[WIPO]  "The Role of Technical Measures",  RFC3,
1869	           http://wipo2.wipo.int/process2/rfc/rfc3/index.html

1871	[WORLD 95] "The World Almanac and Book of Facts 1995", ISBN 0-88687-766-0

1873	[Ye95] Liana Ye, "A Language Oriented Chinese Encoding for Multilingual
1874	    Computing Environments", in "Proceeding of the 1995 International
1875	    Conference on Computer Processing of Oriental Languages", Page 323.

1877	9. Acknowledgements

1879	The author has benefited from special comments and suggestions from
1880	Aaron Irvine, John C Klensin, Eric Brunner-Williams, Erik Nordmark and
1881	William Davis and relevant discussions from IDN Working Group to improve
1882	this document.

1884	10. IANA Considerations

1886	This document requires IANA action for availability of script tag,
1887	and registration for each tag and possibly its sub-field for phonetic
1888	system used, and readiness of associated language specific procedures.

1890	11. Authors' Contact Information

1892	Liana Ye
1893	Y&D ISG
1894	2607 Read Ave.
1895	Belmont, CA 94002, USA.
1896	(650) 592-7092
1897	liana.ydisg@juno.com

1899	Expires March 2002

1901	[Appendix] StepCode keyboard input process for Chinese

1903	/* buff.c  StepCode processor interface   Copyright Y&D ISG, Inc. 1994
1904	 *-----------------------------------------------------------------------*
1905	 *  find_gly  find a glyph online.
1906	 *  find_wd   find a word online.
1907	 */

1909	#include <stdio.h>
1910	#include <ctype.h>
1911	#include "steplib.h"

1913	int auto_learn= TRUE;	/* when set, find_wd() calls learnword() for a word that search_dic() did not find */
1914	int udic_large= FALSE;	/* NOTE(review): udic_large, udic_database, odic_expand and keyboard_in are not referenced by find_gly/find_wd/inputp below - confirm where they are used */
1915	int udic_database= FALSE;
1916	int odic_expand = FALSE;
1917	int dic_saved = FALSE;	/* find_wd() caches a newly learned word only when this is set, then clears it; presumably set by learnword() - confirm */
1918	int keyboard_in = TRUE;
1919	int alt_memb = 2;	/* extra members of a poly-code to be recorded */

1921	/*
1922	 * find_gly  using a StepCode to find the GB code for display a glyph.
1923	 */
1924	int find_gly(step, stepcd, infor, gb, key)
1925		char *step, *stepcd, *infor, *gb;
1926		int *key;
1927	{
1928		FILE *bufp;
1929		int linecnt, bytes;
1930		char line[MAXdatalen], *p;
1931		char bufname[FILENAMSIZ];

1933		strncpy(stepcd, step, strlen(step)+1);
1934		if (hit_gly(stepcd, gb))
1935			{ *key=GB; return(A_to_B);}

1937		strncpy(bufname, BUFFILE, FILENAMSIZ);
1938		bufp = (FILE *)fopen(bufname, "w+b");
1939		if( bufp == NULL )
1940		{
1941			strcpy( message, "Buffer file unavailable.");
1942			typo(message, word);
1943			return(ERROR);
1944	  	}
1945		search_dic(STEP, 1, stepcd, bufname, &bufp, &linecnt);
1946		if (linecnt<=0)
1947		{
1948			if(verbose)
1949			typo("No entry found in GB table. You may create one.", step);

1951			fclose(bufp);
1952			return(A_to_ZIL);
1953		}
1954		fseek( bufp, 0L, 0 );		/* to beginning sake read */
1955		if(fgets(line, MAXdatalen,  bufp)== NULL)
1956		{	if(verbose)
1957			fprintf(stderr, "ERROR- buffer file read error.\n");
1958			fclose(bufp);
1959			return(ERROR);
1960		}
1961		sscanf(line, "%s%d%s%s\n", stepcd, key, gb, infor);
1962		hash_gly(stepcd, gb);
1963		fclose(bufp);
1964		if (linecnt>1)
1965		{
1966			return( A_to_N);
1967		}else {
1968			return( A_to_B);
1969		}
1970	}

1972	int find_wd(step, stepcd, infor, gb, cnt, key)
1973		char *step, *stepcd, *infor, *gb;
1974		int cnt, *key;
1975	{
1976		FILE *bufp;
1977		int linecnt;
1978		char line[MAXdatalen], *p;
1979		char bufname[FILENAMSIZ];

1981		strncpy(stepcd, step, strlen(step)+1);
1982		if ( hit_wd(stepcd, gb))
1983			{ *key = GB; return(A_to_B);}

1985		strncpy(bufname, BUFFILE, FILENAMSIZ);
1986		bufp = (FILE *)fopen(bufname, "w+b");
1987		if( bufp == NULL )
1988		{
1989			fprintf( stderr, "Buffer file unavailable.");
1990			return(ERROR);
1991	  	}
1992		search_dic(STEP, cnt, stepcd, bufname, &bufp, &linecnt);
1993		if (linecnt<=0)
1994		{	if (!auto_learn)
1995			{
1996			   if(verbose)
1997				typo("Not found.  You may create the word.", step);
1998			   fclose(bufp);
1999			   return(A_to_ZIL);
2000			}else
2001			{
2002				neww = learnword(cnt, stepcd, gb);
2003				/* Do whatever with neww here */
2004				if(dic_saved)
2005					{
2006						hash_wd(stepcd, gb);
2007						dic_saved = FALSE;
2008					}
2009				else
2010				{
2011				   typo("The new word has not saved.", stepcd);
2012				}
2013				fclose(bufp);
2014				neww = reset_word(neww);
2015				return(ZIL_to_A);
2016			}
2017		}
2018		fseek( bufp, 0L, 0 );		/* to beginning sake read */
2019		fgets(line, MAXdatalen,  bufp);
2020		if(line == NULL)
2021		{
2022			if (ferror(bufp)!=0 && verbose)
2023				fprintf(stderr, "Error during buffer read.\n");
2024			if (feof(bufp) !=0 && verbose)
2025				fprintf(stderr, "Buffer file ended.\n");
2026			clearerr(bufp);
2027			fclose(bufp);
2028			return(A_to_ZIL);
2029		}
2030		sscanf(line, "%s%d%s%s\n", stepcd, key, gb, infor);
2031		hash_wd(stepcd, gb);
2032		fclose(bufp);
2033		if (linecnt>1)
2034		{
2035			return( A_to_N);
2036		}else {
2037			return (A_to_B);
2038		}
2039	}

/* --------------------------------------------------------------------
 * Figure out the number of glyphs in a word. The next two routines are
 * based on the PINYIN system.
 */

/*
 * one_letter_sound  count the glyphs of an entry written purely as
 *                   repeated single-letter sounds.
 *
 * The leading letters must be a run of m/n (either case, optionally
 * followed by one 'g' when the last letter of the run is 'n'), or a run
 * of 'a', or of 'o', or of 'e'.  The run length is returned only when
 * the character after it is not a letter; otherwise the entry is an
 * ordinary syllable and 0 is returned.
 *
 *   returns run length:  a  aa  ooo  eee-  mmm  nmn  mng
 *   returns 0:           an  hhh  oong
 */
int one_letter_sound(char *word)
{
	char *w = word;
	int cnt = 0;

	while (*w == 'm' || *w == 'M' || *w == 'n' || *w == 'N') {
		++cnt;
		++w;
	}

	if (cnt > 0) {
		/* w[-1] is the last letter of the m/n run just consumed. */
		if ((*w == 'g' || *w == 'G') && (w[-1] == 'n' || w[-1] == 'N'))
			++w;	/* ex: mng nnng */
	} else {
		while (*w == 'a' || *w == 'A') { ++cnt; ++w; }
		if (cnt == 0)
			while (*w == 'o' || *w == 'O') { ++cnt; ++w; }
		if (cnt == 0)
			while (*w == 'e' || *w == 'E') { ++cnt; ++w; }
	}

	/* <ctype.h> functions need a value representable as unsigned char. */
	if (isalpha((unsigned char)*w))
		return (0);	/* ex: an hhh oong */
	return (cnt);
}

/*
 * tell_word  work out how many glyphs an entry stands for.
 *
 * Returns 0 when the entry does not start with a letter.  Otherwise:
 *   1. if digits (tone marks) directly follow the leading run of
 *      letters, the number of those digits is returned;
 *   2. else if one_letter_sound() recognises the entry, its count is
 *      returned;
 *   3. else the lower-case vowels a/i/e/o/u in the leading run of
 *      letters are grouped into syllables and the number of groups is
 *      returned.
 */
int tell_word(char *word)
{
	char *w, *v;
	int cnt = 0;

	/* <ctype.h> functions need a value representable as unsigned char. */
	if (!isalpha((unsigned char)*word))
		return (0);

	for (w = word; isalpha((unsigned char)*w); ++w)
		;	/* skip Pinyin */
	while (isdigit((unsigned char)*w)) {	/* count the tone marks */
		cnt++;
		++w;
	}
	if (cnt >= 1)
		return (cnt);

	/* special single letter glyph cases */
	cnt = one_letter_sound(word);
	if (cnt >= 1)
		return (cnt);

	/*
	 * Find the number of syllables by vowel rules.  No apostrophe is
	 * needed to separate syllables.
	 */
	w = word;
	while (isalpha((unsigned char)*w)) {	/* check the Pinyin only */
		switch (*w) {
		case 'a':
		case 'i':
		case 'e':
		case 'o':
		case 'u':
			v = w;		/* first vowel of the group */
			++w;
			cnt++;		/* one vowel case */
			switch (*w) {
			case 'i':
			case 'e':
			case 'o':
			case 'u':
				++w;	/* two vowels sound */
				break;
			case 'a':
				++w;
				if (*v == 'u' && *w == 'i')
					break;	/* uai */
				if (*v == 'i' && *w == 'o')
					break;	/* iao */
				--w;		/* still two vowels */
				break;
			default:
				break;
			}
			break;
		default:
			/* consonant: nothing to count */
			break;
		}
		/*
		 * The vowel cases can already leave w on the terminating NUL;
		 * stepping again would read past the end of the string.
		 */
		if (*w != '\0')
			++w;
	}
	return (cnt);
}

2127	/*
2128	 * --------------------------------------------------------------------
2129	 * Interactive input process procedure
2130	 * --------------------------------------------------------------------
2131	 */
2132	inputp(char *word, char *gb)
2133	{
2134		int  i,  glyphcnt;
2135		char c, *w;
2136		int cnt, key, stat;
2137		char dump[MAXdatalen];

2139		for (;;)
2140		{
2141			*word='\0';
2142			fgets(word, MAXlinelen, stdin);
2143			if (isspace(*word))
2144				break;

2146			/* Check if the entry is a glyph string by */
2147			glyphcnt = tell_word(word);
2148			if (glyphcnt == NULL)
2149			{
2150				printf("%s", *word);
2151				fflush(stdin);
2152				continue;
2153			}

2155			w=word;
2156			while (isalnum(*w)) ++w;
2157			*w = '\0';
2158			if(verbose)
2159				printf("tell_word figure:  %d glyphs\n", glyphcnt);

2161			/* Determin the entry is known through dictionary
2162			 * and cache lookup.
2163			 */
2164			if(glyphcnt >=2)
2165				stat = find_wd(word, stepcd, dump,gb,glyphcnt, &key);
2166			else stat = find_gly(word, stepcd, dump,gb, &key);

2168			/* Print out with GB code */
2169			if (!stat==ERROR) font_code(stepcd, gb, &key, stderr);
2170			if(verbose) printf("%s\n", stepcd);
2171			fflush(stdin);
2172			fflush(stderr);
2173		}
2174		return(0);
2175	}