001    /*
002     *  This file is part of the Jikes RVM project (http://jikesrvm.org).
003     *
004     *  This file is licensed to You under the Eclipse Public License (EPL);
005     *  You may not use this file except in compliance with the License. You
006     *  may obtain a copy of the License at
007     *
008     *      http://www.opensource.org/licenses/eclipse-1.0.php
009     *
010     *  See the COPYRIGHT.txt file distributed with this work for information
011     *  regarding copyright ownership.
012     */
013    package org.jikesrvm.classloader;
014    
015    import java.io.UTFDataFormatException;
016    import java.nio.ByteBuffer;
017    import org.vmmagic.pragma.Pure;
018    import org.jikesrvm.VM;
019    import org.vmmagic.pragma.Inline;
020    import org.vmmagic.pragma.NoInline;
021    
022    /**
023     * Abstract class that contains conversion routines to/from utf8
024     * and/or pseudo-utf8.  It does not support utf8 encodings of
025     * more than 3 bytes.
026     * <p>
027     * The difference between utf8 and pseudo-utf8 is the special
028     * treatment of null.  In utf8, null is encoded as a single byte
029     * directly, whereas in pseudo-utf8, it is encoded as a two-byte
030     * sequence.  See the JVM specification for more information.
031     */
032    public abstract class UTF8Convert {
033    
034      /**
035       * Strictly check the format of the utf8/pseudo-utf8 byte array in
036       * fromUTF8.
037       */
038      static final boolean STRICTLY_CHECK_FORMAT = false;
039      /**
040       * Set fromUTF8 to not throw an exception when given a normal utf8
041       * byte array.
042       */
043      static final boolean ALLOW_NORMAL_UTF8 = false;
044      /**
045       * Set fromUTF8 to not throw an exception when given a pseudo utf8
046       * byte array.
047       */
048      static final boolean ALLOW_PSEUDO_UTF8 = true;
049      /**
050       * Set toUTF8 to write in pseudo-utf8 (rather than normal utf8).
051       */
052      static final boolean WRITE_PSEUDO_UTF8 = true;
053    
054      /**
055       * UTF8 character visitor abstraction
056       */
057      private abstract static class UTF8CharacterVisitor {
058        abstract void visit_char(char c);
059      }
060    
061      /**
062       * Visitor that builds up a char[] as characters are decoded
063       */
064      private static final class ByteArrayStringEncoderVisitor extends UTF8CharacterVisitor {
065        final char[] result;
066        int index;
067        ByteArrayStringEncoderVisitor(int length) {
068          result = new char[length];
069          index = 0;
070        }
071        @Override
072        void visit_char(char c) {
073          result[index] = c;
074          index++;
075        }
076        @Override
077        public String toString() {
078          if (VM.runningVM) {
079            return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
080          } else {
081            return new String(result, 0, index);
082          }
083        }
084      }
085    
086      /**
087       * Visitor that builds up a char[] as characters are decoded
088       */
089      private static final class ByteBufferStringEncoderVisitor extends UTF8CharacterVisitor {
090        final char[] result;
091        int index;
092        ByteBufferStringEncoderVisitor(int length) {
093          result = new char[length];
094          index = 0;
095        }
096        @Override
097        void visit_char(char c) {
098          result[index] = c;
099          index++;
100        }
101        @Override
102        public String toString() {
103          if (VM.runningVM) {
104            return java.lang.JikesRVMSupport.newStringWithoutCopy(result, 0, index);
105          } else {
106            return new String(result, 0, index);
107          }
108        }
109      }
110    
111      /**
112       * Visitor that builds up a String.hashCode form hashCode as characters are decoded
113       */
114      private static final class StringHashCodeVisitor extends UTF8CharacterVisitor {
115        int result = 0;
116        @Override
117        void visit_char(char c) {
118          result = result * 31 + c;
119        }
120        int getResult() {
121          return result;
122        }
123      }
124    
125      /**
126       * Convert the given sequence of (pseudo-)utf8 formatted bytes
127       * into a String.<p>
128       *
129       * The acceptable input formats are controlled by the
130       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
131       * flags.
132       *
133       * @param utf8 (pseudo-)utf8 byte array
134       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
135       * @return unicode string
136       */
137      public static String fromUTF8(byte[] utf8) throws UTFDataFormatException {
138        UTF8CharacterVisitor visitor = new ByteArrayStringEncoderVisitor(utf8.length);
139        visitUTF8(utf8, visitor);
140        return visitor.toString();
141      }
142    
143      /**
144       * Convert the given sequence of (pseudo-)utf8 formatted bytes
145       * into a String.
146       *
147       * The acceptable input formats are controlled by the
148       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
149       * flags.<p>
150       *
151       * @param utf8 (pseudo-)utf8 byte array
152       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
153       * @return unicode string
154       */
155      public static String fromUTF8(ByteBuffer utf8) throws UTFDataFormatException {
156        UTF8CharacterVisitor visitor = new ByteBufferStringEncoderVisitor(utf8.remaining());
157        visitUTF8(utf8, visitor);
158        return visitor.toString();
159      }
160    
161      /**
162       * Convert the given sequence of (pseudo-)utf8 formatted bytes
163       * into a String hashCode.<p>
164       *
165       * The acceptable input formats are controlled by the
166       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
167       * flags.
168       *
169       * @param utf8 (pseudo-)utf8 byte array
170       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
171       * @return hashCode corresponding to if this were a String.hashCode
172       */
173      public static int computeStringHashCode(byte[] utf8) throws UTFDataFormatException {
174        StringHashCodeVisitor visitor = new StringHashCodeVisitor();
175        visitUTF8(utf8, visitor);
176        return visitor.getResult();
177      }
178    
179      /**
180       * Generate exception messages without bloating code
181       */
182      @NoInline
183      private static void throwDataFormatException(String message, int location) throws UTFDataFormatException {
184        throw new UTFDataFormatException(message + " at location " + location);
185      }
186    
187      /**
188       * Visit all bytes of the given utf8 string calling the visitor when a
189       * character is decoded.<p>
190       *
191       * The acceptable input formats are controlled by the
192       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
193       * flags.
194       *
195       * @param utf8 (pseudo-)utf8 byte array
196       * @param visitor called when characters are decoded
197       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
198       */
199      @Inline
200      private static void visitUTF8(byte[] utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
201        for (int i = 0, n = utf8.length; i < n;) {
202          byte b = utf8[i++];
203          if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
204            if (b == 0) {
205              throwDataFormatException("0 byte encountered", i-1);
206            }
207          }
208          if (b >= 0) {  // < 0x80 unsigned
209            // in the range '\001' to '\177'
210            visitor.visit_char((char) b);
211            continue;
212          }
213          try {
214            byte nb = utf8[i++];
215            if (b < -32) {  // < 0xe0 unsigned
216              // '\000' or in the range '\200' to '\u07FF'
217              char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
218              visitor.visit_char(c);
219              if (STRICTLY_CHECK_FORMAT) {
220                if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
221                  throwDataFormatException("invalid marker bits for double byte char" , i-2);
222                }
223                if (c < '\200') {
224                  if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
225                    throwDataFormatException("encountered double byte char that should have been single byte", i-2);
226                  }
227                } else if (c > '\u07FF') {
228                  throwDataFormatException("encountered double byte char that should have been single byte", i-2);
229                }
230              }
231            } else {
232              byte nnb = utf8[i++];
233              // in the range '\u0800' to '\uFFFF'
234              char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
235              visitor.visit_char(c);
236              if (STRICTLY_CHECK_FORMAT) {
237                if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
238                  throwDataFormatException("invalid marker bits for triple byte char", i - 3);
239                }
240                if (c < '\u0800') {
241                  throwDataFormatException("encountered triple byte char that should have been fewer bytes", i - 3);
242                }
243              }
244            }
245          } catch (ArrayIndexOutOfBoundsException e) {
246            throwDataFormatException("unexpected end", i);
247          }
248        }
249      }
250    
251      /**
252       * Visit all bytes of the given utf8 string calling the visitor when a
253       * character is decoded.<p>
254       *
255       * The acceptable input formats are controlled by the
256       * STRICTLY_CHECK_FORMAT, ALLOW_NORMAL_UTF8, and ALLOW_PSEUDO_UTF8
257       * flags.
258       *
259       * @param utf8 (pseudo-)utf8 byte array
260       * @param visitor called when characters are decoded
261       * @throws UTFDataFormatException if the (pseudo-)utf8 byte array is not valid (pseudo-)utf8
262       */
263      @Inline
264      private static void visitUTF8(ByteBuffer utf8, UTF8CharacterVisitor visitor) throws UTFDataFormatException {
265        while (utf8.hasRemaining()) {
266          byte b = utf8.get();
267          if (STRICTLY_CHECK_FORMAT && !ALLOW_NORMAL_UTF8) {
268            if (b == 0) {
269              throwDataFormatException("0 byte encountered", utf8.position() - 1);
270            }
271          }
272          if (b >= 0) {  // < 0x80 unsigned
273            // in the range '\001' to '\177'
274            visitor.visit_char((char) b);
275            continue;
276          }
277          try {
278            byte nb = utf8.get();
279            if (b < -32) {  // < 0xe0 unsigned
280              // '\000' or in the range '\200' to '\u07FF'
281              char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
282              visitor.visit_char(c);
283              if (STRICTLY_CHECK_FORMAT) {
284                if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
285                  throwDataFormatException("invalid marker bits for double byte char", utf8.position() - 2);
286                }
287                if (c < '\200') {
288                  if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
289                    throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
290                  }
291                } else if (c > '\u07FF') {
292                  throwDataFormatException("encountered double byte char that should have been single byte", utf8.position() - 2);
293                }
294              }
295            } else {
296              byte nnb = utf8.get();
297              // in the range '\u0800' to '\uFFFF'
298              char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
299              visitor.visit_char(c);
300              if (STRICTLY_CHECK_FORMAT) {
301                if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
302                  throwDataFormatException("invalid marker bits for triple byte char", utf8.position() - 3);
303                }
304                if (c < '\u0800') {
305                  throwDataFormatException("encountered triple byte char that should have been fewer bytes", utf8.position() - 3);
306                }
307              }
308            }
309          } catch (ArrayIndexOutOfBoundsException e) {
310            throwDataFormatException("unexpected end", utf8.position());
311          }
312        }
313      }
314    
315      /**
316       * Convert the given String into a sequence of (pseudo-)utf8
317       * formatted bytes.<p>
318       *
319       * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
320       *
321       * @param s String to convert
322       * @return array containing sequence of (pseudo-)utf8 formatted bytes
323       */
324      public static byte[] toUTF8(String s) {
325        byte[] result = new byte[utfLength(s)];
326        int result_index = 0;
327        for (int i = 0, n = s.length(); i < n; ++i) {
328          char c = s.charAt(i);
329          // in all shifts below, c is an (unsigned) char,
330          // so either >>> or >> is ok
331          if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
332            result[result_index++] = (byte) c;
333          } else if (c > 0x07FF) {
334            result[result_index++] = (byte) (0xe0 | (byte) (c >> 12));
335            result[result_index++] = (byte) (0x80 | ((c & 0xfc0) >> 6));
336            result[result_index++] = (byte) (0x80 | (c & 0x3f));
337          } else {
338            result[result_index++] = (byte) (0xc0 | (byte) (c >> 6));
339            result[result_index++] = (byte) (0x80 | (c & 0x3f));
340          }
341        }
342        return result;
343      }
344    
345      /**
346       * Convert the given String into a sequence of (pseudo-)utf8
347       * formatted bytes.<p>
348       *
349       * The output format is controlled by the WRITE_PSEUDO_UTF8 flag.
350       *
351       * @param s String to convert
352       * @param b Byte buffer to hold result
353       */
354      @Inline
355      public static void toUTF8(String s, ByteBuffer b) {
356        int result_index = 0;
357        for (int i = 0, n = s.length(); i < n; ++i) {
358          char c = s.charAt(i);
359          // in all shifts below, c is an (unsigned) char,
360          // so either >>> or >> is ok
361          if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
362            b.put((byte) c);
363          } else if (c > 0x07FF) {
364            b.put((byte) (0xe0 | (byte) (c >> 12)));
365            b.put((byte) (0x80 | ((c & 0xfc0) >> 6)));
366            b.put((byte) (0x80 | (c & 0x3f)));
367          } else {
368            b.put((byte) (0xc0 | (byte) (c >> 6)));
369            b.put((byte) (0x80 | (c & 0x3f)));
370          }
371        }
372      }
373    
374      /**
375       * Returns the length of a string's UTF encoded form.
376       */
377      @Pure
378      public static int utfLength(String s) {
379        int utflen = 0;
380        for (int i = 0, n = s.length(); i < n; ++i) {
381          int c = s.charAt(i);
382          if (((!WRITE_PSEUDO_UTF8) || (c >= 0x0001)) && (c <= 0x007F)) {
383            ++utflen;
384          } else if (c > 0x07FF) {
385            utflen += 3;
386          } else {
387            utflen += 2;
388          }
389        }
390        return utflen;
391      }
392    
393      /**
394       * Check whether the given sequence of bytes is valid (pseudo-)utf8.
395       *
396       * @param bytes byte array to check
397       * @return {@code true} iff the given sequence is valid (pseudo-)utf8.
398       */
399      public static boolean check(byte[] bytes) {
400        for (int i = 0, n = bytes.length; i < n;) {
401          byte b = bytes[i++];
402          if (!ALLOW_NORMAL_UTF8) {
403            if (b == 0) return false;
404          }
405          if (b >= 0) {  // < 0x80 unsigned
406            // in the range '\001' to '\177'
407            continue;
408          }
409          try {
410            byte nb = bytes[i++];
411            if (b < -32) {  // < 0xe0 unsigned
412              // '\000' or in the range '\200' to '\u07FF'
413              char c = (char) (((b & 0x1f) << 6) | (nb & 0x3f));
414              if (((b & 0xe0) != 0xc0) || ((nb & 0xc0) != 0x80)) {
415                return false;
416              }
417              if (c < '\200') {
418                if (!ALLOW_PSEUDO_UTF8 || (c != '\000')) {
419                  return false;
420                }
421              } else if (c > '\u07FF') {
422                return false;
423              }
424            } else {
425              byte nnb = bytes[i++];
426              // in the range '\u0800' to '\uFFFF'
427              char c = (char) (((b & 0x0f) << 12) | ((nb & 0x3f) << 6) | (nnb & 0x3f));
428              if (((b & 0xf0) != 0xe0) || ((nb & 0xc0) != 0x80) || ((nnb & 0xc0) != 0x80)) {
429                return false;
430              }
431              if (c < '\u0800') {
432                return false;
433              }
434            }
435          } catch (ArrayIndexOutOfBoundsException e) {
436            return false;
437          }
438        }
439        return true;
440      }
441    }