# The VIetnamese Quoted-Readable encoding (VIQR) # Last edited on 2004-02-28 04:53:48 by stolfi # # According to the VIQR standard, the following characters are subject # to interpretation as Vietnamese diacritical marks: # # ==================================================================== # Diacritic Char ASCII Code Da^'u Example # ---------------------------------------------------------------- # breve ( 0x28, left paren tra(ng ba(n khoa(n # circumflex ^ 0x5E, caret mu~ ho^m nay # horn + 0x2B, plus sign mo'c Qui Nho+n # # acute ' 0x27, apostrophe sa('c La'i Thie^u # grave ` 0x60, backquote huye^`n Bi`nh Du+o+ng # hook-above ? 0x3F, question ho?i Thu? DDu+'c # tilde ~ 0x7E, tilde nga~ di~ va~ng # dot-below . 0x2E, period na(.ng ho.c ta^.p # ==================================================================== # In addition, dd stands for d bar and DD stands for D bar. # # The VIQR standard specifies that a character is interpreted as a # diacritic if and only if it can combine with the previous letter # into a legal Vietnamese character. To prevent combination, the # character must be preceded by an escape character, which is usually # the backslash ( \ ) character. # # Examples: # # O+ is a legal Vietnamese letter (O horn), hence + is a diacritic mark. # # A+ is not a legal Vietnamese letter, hence + is a stand-alone plus sign, # not a diacritic mark. # # ... mo+'i ra ddo+`i\. -- the backslash indicates the following period # is punctuation, not a dot-below diacritic. # # Anh ddi dda^u\? -- the backslash indicates the following # question mark is punctuation, not a hook-above diacritic. # # The complete VIQR standard specifies that a text stream is always # tagged as being in Vietnamese mode, English mode, or Literal mode: # # A sequence \V signifies that the text following it is in Vietnamese # mode and must be interpreted according to the VIQR convention for # diacritic marks. # # A sequence \M signifies that the text following it is in English # mode. 
Only sequences beginning with a backslash are interpreted # according to the VIQR convention for diacritic marks. # # A sequence \L signifies that the text following it is in Literal # mode. No interpretation applies, except for the switching mode # sequences \V, \M, \L, and their lower-case counterparts. # # For complete information, please read the official Viet-Std report # published in 1992 at # # http://www.vietstd.org. # # Linux regular expression description of valid VIQR accented-char # combinations: # # escaped for shell (csh): # "/(([Aa][\\(\\^]?)|([Oo][\\+\\^]?)|([Uu][\\+]?)|([Ee][\\^]?)|[IiYy])[`'.\\?~]?/" # # inside gawk: # /(([Aa][\(\^]?)|([Oo][\+\^]?)|([Uu][\+]?)|([Ee][\^]?)|[IiYy])[`\'.\?~]?/ # # Vietnamese Characters under VISCII and VIQR by Encoding Order # # +-------+-------+-------+----------------------------- # |VISCII |VISCII | VIQR | Descriptive # |(dec.) |(oct.) | chars | Name # +-------+-------+-------+----------------------------- # | 065 | 101 | A | uppercase_A # | 193 | 301 | A' | uppercase_A_acute # | 197 | 305 | A( | uppercase_A_breve # | 129 | 201 | A(' | uppercase_A_breve_acute # | 131 | 203 | A(. | uppercase_A_breve_dot-below # | 002 | 002 | A(? | uppercase_A_breve_hook-above # | 130 | 202 | A(` | uppercase_A_breve_grave # | 005 | 005 | A(~ | uppercase_A_breve_tilde # | 128 | 200 | A. | uppercase_A_dot-below # | 196 | 304 | A? | uppercase_A_hook-above # | 194 | 302 | A^ | uppercase_A_circumflex # | 132 | 204 | A^' | uppercase_A_circumflex_acute # | 135 | 207 | A^. | uppercase_A_circumflex_dot-below # | 134 | 206 | A^? 
| uppercase_A_circumflex_hook-above # | 133 | 205 | A^` | uppercase_A_circumflex_grave # | 006 | 006 | A^~ | uppercase_A_circumflex_tilde # | 192 | 300 | A` | uppercase_A_grave # | 195 | 303 | A~ | uppercase_A_tilde # | 066 | 102 | B | uppercase_B # | 067 | 103 | C | uppercase_C # | 068 | 104 | D | uppercase_D # | 208 | 320 | DD | uppercase_D_bar # | 069 | 105 | E | uppercase_E # | 201 | 311 | E' | uppercase_E_acute # | 137 | 211 | E. | uppercase_E_dot-below # | 203 | 313 | E? | uppercase_E_hook-above # | 202 | 312 | E^ | uppercase_E_circumflex # | 138 | 212 | E^' | uppercase_E_circumflex_acute # | 142 | 216 | E^. | uppercase_E_circumflex_dot-below # | 140 | 214 | E^? | uppercase_E_circumflex_hook-above # | 139 | 213 | E^` | uppercase_E_circumflex_grave # | 141 | 215 | E^~ | uppercase_E_circumflex_tilde # | 200 | 310 | E` | uppercase_E_grave # | 136 | 210 | E~ | uppercase_E_tilde # | 070 | 106 | F | uppercase_F # | 071 | 107 | G | uppercase_G # | 072 | 110 | H | uppercase_H # | 073 | 111 | I | uppercase_I # | 205 | 315 | I' | uppercase_I_acute # | 152 | 230 | I. | uppercase_I_dot-below # | 155 | 233 | I? | uppercase_I_hook-above # | 204 | 314 | I` | uppercase_I_grave # | 206 | 316 | I~ | uppercase_I_tilde # | 074 | 112 | J | uppercase_J # | 075 | 113 | K | uppercase_K # | 076 | 114 | L | uppercase_L # | 077 | 115 | M | uppercase_M # | 078 | 116 | N | uppercase_N # | 079 | 117 | O | uppercase_O # | 211 | 323 | O' | uppercase_O_acute # | 180 | 264 | O+ | uppercase_O_horn # | 149 | 225 | O+' | uppercase_O_horn_acute # | 148 | 224 | O+. | uppercase_O_horn_dot-below # | 151 | 227 | O+? | uppercase_O_horn_hook-above # | 150 | 226 | O+` | uppercase_O_horn_grave # | 179 | 263 | O+~ | uppercase_O_horn_tilde # | 154 | 232 | O. | uppercase_O_dot-below # | 153 | 231 | O? | uppercase_O_hook-above # | 212 | 324 | O^ | uppercase_O_circumflex # | 143 | 217 | O^' | uppercase_O_circumflex_acute # | 147 | 223 | O^. | uppercase_O_circumflex_dot-below # | 145 | 221 | O^? 
| uppercase_O_circumflex_hook-above # | 144 | 220 | O^` | uppercase_O_circumflex_grave # | 146 | 222 | O^~ | uppercase_O_circumflex_tilde # | 210 | 322 | O` | uppercase_O_grave # | 213 | 325 | O~ | uppercase_O_tilde # | 080 | 120 | P | uppercase_P # | 081 | 121 | Q | uppercase_Q # | 082 | 122 | R | uppercase_R # | 083 | 123 | S | uppercase_S # | 084 | 124 | T | uppercase_T # | 085 | 125 | U | uppercase_U # | 218 | 332 | U' | uppercase_U_acute # | 191 | 277 | U+ | uppercase_U_horn # | 186 | 272 | U+' | uppercase_U_horn_acute # | 185 | 271 | U+. | uppercase_U_horn_dot-below # | 188 | 274 | U+? | uppercase_U_horn_hook-above # | 187 | 273 | U+` | uppercase_U_horn_grave # | 255 | 377 | U+~ | uppercase_U_horn_tilde # | 158 | 236 | U. | uppercase_U_dot-below # | 156 | 234 | U? | uppercase_U_hook-above # | 217 | 331 | U` | uppercase_U_grave # | 157 | 235 | U~ | uppercase_U_tilde # | 086 | 126 | V | uppercase_V # | 087 | 127 | W | uppercase_W # | 088 | 130 | X | uppercase_X # | 089 | 131 | Y | uppercase_Y # | 221 | 335 | Y' | uppercase_Y_acute # | 030 | 036 | Y. | uppercase_Y_dot-below # | 020 | 024 | Y? | uppercase_Y_hook-above # | 159 | 237 | Y` | uppercase_Y_grave # | 025 | 031 | Y~ | uppercase_Y_tilde # | 090 | 132 | Z | uppercase_Z # | 097 | 141 | a | lowercase_a # | 225 | 341 | a' | lowercase_a_acute # | 229 | 345 | a( | lowercase_a_breve # | 161 | 241 | a(' | lowercase_a_breve_acute # | 163 | 243 | a(. | lowercase_a_breve_dot-below # | 198 | 306 | a(? | lowercase_a_breve_hook-above # | 162 | 242 | a(` | lowercase_a_breve_grave # | 199 | 307 | a(~ | lowercase_a_breve_tilde # | 160 | 240 | a. | lowercase_a_dot-below # | 228 | 344 | a? | lowercase_a_hook-above # | 226 | 342 | a^ | lowercase_a_circumflex # | 164 | 244 | a^' | lowercase_a_circumflex_acute # | 167 | 247 | a^. | lowercase_a_circumflex_dot-below # | 166 | 246 | a^? 
| lowercase_a_circumflex_hook-above # | 165 | 245 | a^` | lowercase_a_circumflex_grave # | 231 | 347 | a^~ | lowercase_a_circumflex_tilde # | 224 | 340 | a` | lowercase_a_grave # | 227 | 343 | a~ | lowercase_a_tilde # | 098 | 142 | b | lowercase_b # | 099 | 143 | c | lowercase_c # | 100 | 144 | d | lowercase_d # | 240 | 360 | dd | lowercase_d_bar # | 101 | 145 | e | lowercase_e # | 233 | 351 | e' | lowercase_e_acute # | 169 | 251 | e. | lowercase_e_dot-below # | 235 | 353 | e? | lowercase_e_hook-above # | 234 | 352 | e^ | lowercase_e_circumflex # | 170 | 252 | e^' | lowercase_e_circumflex_acute # | 174 | 256 | e^. | lowercase_e_circumflex_dot-below # | 172 | 254 | e^? | lowercase_e_circumflex_hook-above # | 171 | 253 | e^` | lowercase_e_circumflex_grave # | 173 | 255 | e^~ | lowercase_e_circumflex_tilde # | 232 | 350 | e` | lowercase_e_grave # | 168 | 250 | e~ | lowercase_e_tilde # | 102 | 146 | f | lowercase_f # | 103 | 147 | g | lowercase_g # | 104 | 150 | h | lowercase_h # | 105 | 151 | i | lowercase_i # | 237 | 355 | i' | lowercase_i_acute # | 184 | 270 | i. | lowercase_i_dot-below # | 239 | 357 | i? | lowercase_i_hook-above # | 236 | 354 | i` | lowercase_i_grave # | 238 | 356 | i~ | lowercase_i_tilde # | 106 | 152 | j | lowercase_j # | 107 | 153 | k | lowercase_k # | 108 | 154 | l | lowercase_l # | 109 | 155 | m | lowercase_m # | 110 | 156 | n | lowercase_n # | 111 | 157 | o | lowercase_o # | 243 | 363 | o' | lowercase_o_acute # | 189 | 275 | o+ | lowercase_o_horn # | 190 | 276 | o+' | lowercase_o_horn_acute # | 254 | 376 | o+. | lowercase_o_horn_dot-below # | 183 | 267 | o+? | lowercase_o_horn_hook-above # | 182 | 266 | o+` | lowercase_o_horn_grave # | 222 | 336 | o+~ | lowercase_o_horn_tilde # | 247 | 367 | o. | lowercase_o_dot-below # | 246 | 366 | o? | lowercase_o_hook-above # | 244 | 364 | o^ | lowercase_o_circumflex # | 175 | 257 | o^' | lowercase_o_circumflex_acute # | 181 | 265 | o^. | lowercase_o_circumflex_dot-below # | 177 | 261 | o^? 
| lowercase_o_circumflex_hook-above # | 176 | 260 | o^` | lowercase_o_circumflex_grave # | 178 | 262 | o^~ | lowercase_o_circumflex_tilde # | 242 | 362 | o` | lowercase_o_grave # | 245 | 365 | o~ | lowercase_o_tilde # | 112 | 160 | p | lowercase_p # | 113 | 161 | q | lowercase_q # | 114 | 162 | r | lowercase_r # | 115 | 163 | s | lowercase_s # | 116 | 164 | t | lowercase_t # | 117 | 165 | u | lowercase_u # | 250 | 372 | u' | lowercase_u_acute # | 223 | 337 | u+ | lowercase_u_horn # | 209 | 321 | u+' | lowercase_u_horn_acute # | 241 | 361 | u+. | lowercase_u_horn_dot-below # | 216 | 330 | u+? | lowercase_u_horn_hook-above # | 215 | 327 | u+` | lowercase_u_horn_grave # | 230 | 346 | u+~ | lowercase_u_horn_tilde # | 248 | 370 | u. | lowercase_u_dot-below # | 252 | 374 | u? | lowercase_u_hook-above # | 249 | 371 | u` | lowercase_u_grave # | 251 | 373 | u~ | lowercase_u_tilde # | 118 | 166 | v | lowercase_v # | 119 | 167 | w | lowercase_w # | 120 | 170 | x | lowercase_x # | 121 | 171 | y | lowercase_y # | 253 | 375 | y' | lowercase_y_acute # | 220 | 334 | y. | lowercase_y_dot-below # | 214 | 326 | y? | lowercase_y_hook-above # | 207 | 317 | y` | lowercase_y_grave # | 219 | 333 | y~ | lowercase_y_tilde # | 122 | 172 | z | lowercase_z # +-------+-------+-------+----------------------------- # # * VIQR also allows `Ð' to be represented by `Dd' or `dD'.