001 /* 002 * This file is part of the Jikes RVM project (http://jikesrvm.org). 003 * 004 * This file is licensed to You under the Eclipse Public License (EPL); 005 * You may not use this file except in compliance with the License. You 006 * may obtain a copy of the License at 007 * 008 * http://www.opensource.org/licenses/eclipse-1.0.php 009 * 010 * See the COPYRIGHT.txt file distributed with this work for information 011 * regarding copyright ownership. 012 */ 013 package org.jikesrvm.classloader; 014 015 import java.io.UTFDataFormatException; 016 import java.nio.ByteBuffer; 017 import org.vmmagic.pragma.Pure; 018 import org.jikesrvm.VM; 019 import org.vmmagic.pragma.Inline; 020 import org.vmmagic.pragma.NoInline; 021 022 /** 023 * Abstract class that contains conversion routines to/from utf8 024 * and/or pseudo-utf8. It does not support utf8 encodings of 025 * more than 3 bytes. 026 * <p> 027 * The difference between utf8 and pseudo-utf8 is the special 028 * treatment of null. In utf8, null is encoded as a single byte 029 * directly, whereas in pseudo-utf8, it is encoded as a two-byte 030 * sequence. See the JVM specification for more information. 031 */ 032 public abstract class UTF8Convert { 033 034 /** 035 * Strictly check the format of the utf8/pseudo-utf8 byte array in 036 * fromUTF8. 037 */ 038 static final boolean STRICTLY_CHECK_FORMAT = false; 039 /** 040 * Set fromUTF8 to not throw an exception when given a normal utf8 041 * byte array. 042 */ 043 static final boolean ALLOW_NORMAL_UTF8 = false; 044 /** 045 * Set fromUTF8 to not throw an exception when given a pseudo utf8 046 * byte array. 047 */ 048 static final boolean ALLOW_PSEUDO_UTF8 = true; 049 /** 050 * Set toUTF8 to write in pseudo-utf8 (rather than normal utf8). 051 */ 052 static final boolean WRITE_PSEUDO_UTF8 = true; 053 054 /** 055 * UTF8 character visitor abstraction 056 */ 057 private abstract static class UTF8CharacterVisitor { 058 abstract void visit_char(char c); 059 } 060 061 /** 062 * Visitor that builds up a char[] as characters are decoded 063 */ 064 private static final class ByteArrayStringEncoderVisitor extends UTF8CharacterVisitor { 065 final char[] result; 066 int index; 067 ByteArrayStringEncoderVisitor(int length) { 068 result = new char[length]; 069 index = 0; 070 } 071 @Override 072 void visit_char(char c) { 073 result[index] = c; 074 index++; 075 } 076 @Override 077 public String toString() { 078 if (VM.runningVM) { 079 return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index); 080 } else { 081 return new String(result, 0, index); 082 } 083 } 084 } 085 086 /** 087 * Visitor that builds up a char[] as characters are decoded 088 */ 089 private static final class ByteBufferStringEncoderVisitor extends UTF8CharacterVisitor { 090 final char[] result; 091 int index; 092 ByteBufferStringEncoderVisitor(int length) { 093 result = new char[length]; 094 index = 0; 095 } 096 @Override 097 void visit_char(char c) { 098 result[index] = c; 099 index++; 100 } 101 @Override 102 public String toString() { 103 if (VM.runningVM) { 104 return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index); 105 } else { 106 return new String(result, 0, index); 107 } 108 } 109 } 110 111 /** 112 * Visitor that builds up a String.hashCode form hashCode as characters are decoded 113 */ 114 private static final class StringHashCodeVisitor extends UTF8CharacterVisitor { 115 int result = 0; 116 @Override 117 void visit_char(char c) { 118 result = result * 31 + c; 119 } 120 int getResult() { 121 return result; 122 } 123 } 124 125 /** 126 * Convert the given sequence of (pseudo-)utf8 formatted bytes 127 * into a String.<p> 128 * 129 * The acceptable input formats are controlled by the 130 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 131 * flags. 132 * 133 * @param utf8 (pseudo-)utf8 byte array 134 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 135 * @return unicode string 136 */ 137 public static String fromUTF8(byte[] utf8) throws UTFDataFormatException { 138 UTF8CharacterVisitor visitor = new ByteArrayStringEncoderVisitor(utf8.length); 139 visitUTF8(utf8, visitor); 140 return visitor.toString(); 141 } 142 143 /** 144 * Convert the given sequence of (pseudo-)utf8 formatted bytes 145 * into a String. 146 * 147 * The acceptable input formats are controlled by the 148 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 149 * flags.<p> 150 * 151 * @param utf8 (pseudo-)utf8 byte array 152 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 153 * @return unicode string 154 */ 155 public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException { 156 UTF8CharacterVisitor visitor = new ByteBufferStringEncoderVisitor(utf8.remaining()); 157 visitUTF8(utf8, visitor); 158 return visitor.toString(); 159 } 160 161 /** 162 * Convert the given sequence of (pseudo-)utf8 formatted bytes 163 * into a String hashCode.<p> 164 * 165 * The acceptable input formats are controlled by the 166 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 167 * flags. 168 * 169 * @param utf8 (pseudo-)utf8 byte array 170 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 171 * @return hashCode corresponding to if this were a String.hashCode 172 */ 173 public static int computeStringHashCode(byte[] utf8) throws UTFDataFormatException { 174 StringHashCodeVisitor visitor = new StringHashCodeVisitor(); 175 visitUTF8(utf8, visitor); 176 return visitor.getResult(); 177 } 178 179 /** 180 * Generate exception messages without bloating code 181 */ 182 @NoInline 183 private static void throwDataFormatException(String message, int location) throws UTFDataFormatException { 184 throw new UTFDataFormatException(message + " at location " + location); 185 } 186 187 /** 188 * Visit all bytes of the given utf8 string calling the visitor when a 189 * character is decoded.<p> 190 * 191 * The acceptable input formats are controlled by the 192 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 193 * flags. 194 * 195 * @param utf8 (pseudo-)utf8 byte array 196 * @param visitor called when characters are decoded 197 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 198 */ 199 @Inline 200 private static void visitUTF8(byte[] utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException { 201 for (int i = 0, n = utf8.length; i < n;) { 202 byte b = utf8[i++]; 203 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) { 204 if (b == 0) { 205 throwDataFormatException("0 byte encountered", i-1); 206 } 207 } 208 if (b >= 0) { // < 0x80 unsigned 209 // in the range '\001' to '\177' 210 visitor.visit_char((char) b); 211 continue; 212 } 213 try { 214 byte nb = utf8[i++]; 215 if (b < -32) { // < 0xe0 unsigned 216 // '\000' or in the range '\200' to '\u07FF' 217 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f)); 218 visitor.visit_char(c); 219 if (STRICTLY_CHECK_FORMAT) { 220 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) { 221 throwDataFormatException("invalid marker bits for double byte char" , i-2); 222 } 223 if (c < '\200') { 224 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) { 225 throwDataFormatException("encountered double byte char that should have been single byte", i-2); 226 } 227 } else if (c > '\u07FF') { 228 throwDataFormatException("encountered double byte char that should have been single byte", i-2); 229 } 230 } 231 } else { 232 byte nnb = utf8[i++]; 233 // in the range '\u0800' to '\uFFFF' 234 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f)); 235 visitor.visit_char(c); 236 if (STRICTLY_CHECK_FORMAT) { 237 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) { 238 throwDataFormatException("invalid marker bits for triple byte char", i - 3); 239 } 240 if (c < '\u0800') { 241 throwDataFormatException("encountered triple byte char that should have been fewer bytes", i - 3); 242 } 243 } 244 } 245 } catch (ArrayIndexOutOfBoundsException e) { 246 throwDataFormatException("unexpected end", i); 247 } 248 } 249 } 250 251 /** 252 * Visit all bytes of the given utf8 string calling the visitor when a 253 * character is decoded.<p> 254 * 255 * The acceptable input formats are controlled by the 256 * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8 257 * flags. 258 * 259 * @param utf8 (pseudo-)utf8 byte array 260 * @param visitor called when characters are decoded 261 * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8 262 */ 263 @Inline 264 private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException { 265 while (utf8.hasRemaining()) { 266 byte b = utf8.get(); 267 if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) { 268 if (b == 0) { 269 throwDataFormatException("0 byte encountered", utf8.position() - 1); 270 } 271 } 272 if (b >= 0) { // < 0x80 unsigned 273 // in the range '\001' to '\177' 274 visitor.visit_char((char) b); 275 continue; 276 } 277 try { 278 byte nb = utf8.get(); 279 if (b < -32) { // < 0xe0 unsigned 280 // '\000' or in the range '\200' to '\u07FF' 281 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f)); 282 visitor.visit_char(c); 283 if (STRICTLY_CHECK_FORMAT) { 284 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) { 285 throwDataFormatException("invalid marker bits for double byte char", utf8.position() - 2); 286 } 287 if (c < '\200') { 288 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) { 289 throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2); 290 } 291 } else if (c > '\u07FF') { 292 throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2); 293 } 294 } 295 } else { 296 byte nnb = utf8.get(); 297 // in the range '\u0800' to '\uFFFF' 298 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f)); 299 visitor.visit_char(c); 300 if (STRICTLY_CHECK_FORMAT) { 301 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) { 302 throwDataFormatException("invalid marker bits for triple byte char", utf8.position() - 3); 303 } 304 if (c < '\u0800') { 305 throwDataFormatException("encountered triple byte char that should have been fewer bytes", utf8.position() - 3); 306 } 307 } 308 } 309 } catch (ArrayIndexOutOfBoundsException e) { 310 throwDataFormatException("unexpected end", utf8.position()); 311 } 312 } 313 } 314 315 /** 316 * Convert the given String into a sequence of (pseudo-)utf8 317 * formatted bytes.<p> 318 * 319 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag. 320 * 321 * @param s String to convert 322 * @return array containing sequence of (pseudo-)utf8 formatted bytes 323 */ 324 public static byte[] toUTF8(String s) { 325 byte[] result = new byte[utfLength(s)]; 326 int result_index = 0; 327 for (int i = 0, n = s.length(); i < n; ++i) { 328 char c = s.charAt(i); 329 // in all shifts below, c is an (unsigned) char, 330 // so either >>> or >> is ok 331 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) { 332 result[result_index++] = (byte) c; 333 } else if (c > 0x07FF) { 334 result[result_index++] = (byte) (0xe0 | (byte) (c >> 12)); 335 result[result_index++] = (byte) (0x80 | ((c & 0xfc0) >> 6)); 336 result[result_index++] = (byte) (0x80 | (c & 0x3f)); 337 } else { 338 result[result_index++] = (byte) (0xc0 | (byte) (c >> 6)); 339 result[result_index++] = (byte) (0x80 | (c & 0x3f)); 340 } 341 } 342 return result; 343 } 344 345 /** 346 * Convert the given String into a sequence of (pseudo-)utf8 347 * formatted bytes.<p> 348 * 349 * The output format is controlled by the WRITE_PSEUDO_UTF8 flag. 350 * 351 * @param s String to convert 352 * @param b Byte buffer to hold result 353 */ 354 @Inline 355 public static void toUTF8(String s, ByteBuffer b) { 356 int result_index = 0; 357 for (int i = 0, n = s.length(); i < n; ++i) { 358 char c = s.charAt(i); 359 // in all shifts below, c is an (unsigned) char, 360 // so either >>> or >> is ok 361 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) { 362 b.put((byte) c); 363 } else if (c > 0x07FF) { 364 b.put((byte) (0xe0 | (byte) (c >> 12))); 365 b.put((byte) (0x80 | ((c & 0xfc0) >> 6))); 366 b.put((byte) (0x80 | (c & 0x3f))); 367 } else { 368 b.put((byte) (0xc0 | (byte) (c >> 6))); 369 b.put((byte) (0x80 | (c & 0x3f))); 370 } 371 } 372 } 373 374 /** 375 * Returns the length of a string's UTF encoded form. 376 */ 377 @Pure 378 public static int utfLength(String s) { 379 int utflen = 0; 380 for (int i = 0, n = s.length(); i < n; ++i) { 381 int c = s.charAt(i); 382 if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) { 383 ++utflen; 384 } else if (c > 0x07FF) { 385 utflen += 3; 386 } else { 387 utflen += 2; 388 } 389 } 390 return utflen; 391 } 392 393 /** 394 * Check whether the given sequence of bytes is valid (pseudo-)utf8. 395 * 396 * @param bytes byte array to check 397 * @return {@code true} iff the given sequence is valid (pseudo-)utf8. 398 */ 399 public static boolean check(byte[] bytes) { 400 for (int i = 0, n = bytes.length; i < n;) { 401 byte b = bytes[i++]; 402 if (!ALLOW_NORMAL_UTF8) { 403 if (b == 0) return false; 404 } 405 if (b >= 0) { // < 0x80 unsigned 406 // in the range '\001' to '\177' 407 continue; 408 } 409 try { 410 byte nb = bytes[i++]; 411 if (b < -32) { // < 0xe0 unsigned 412 // '\000' or in the range '\200' to '\u07FF' 413 char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f)); 414 if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) { 415 return false; 416 } 417 if (c < '\200') { 418 if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) { 419 return false; 420 } 421 } else if (c > '\u07FF') { 422 return false; 423 } 424 } else { 425 byte nnb = bytes[i++]; 426 // in the range '\u0800' to '\uFFFF' 427 char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f)); 428 if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) { 429 return false; 430 } 431 if (c < '\u0800') { 432 return false; 433 } 434 } 435 } catch (ArrayIndexOutOfBoundsException e) { 436 return false; 437 } 438 } 439 return true; 440 } 441 }