/*                     __                                               *\
**     ________ ___   / /  ___     Scala API                            **
**    / __/ __// _ | / /  / _ |    (c) 2002-2009, LAMP/EPFL             **
**  __\ \/ /__/ __ |/ /__/ __ |    http://scala-lang.org/               **
** /____/\___/_/ |_/____/_/ | |                                         **
**                          |/                                          **
\*                                                                      */

// $Id: EncodingHeuristics.scala 18387 2009-07-24 15:28:37Z odersky $

package scala.xml
package include.sax
import scala.xml.include._

import java.io.InputStream
import scala.util.matching.Regex

/**
 * <p>
 * <code>EncodingHeuristics</code> reads from a stream
 * (which should be buffered) and attempts to guess
 * what the encoding of the text in the stream is.
 * If it fails to determine the type of the encoding,
 * it returns the default UTF-8.
 * </p>
 *
 * @author Burak Emir
 * @author Paul Phillips
 */
object EncodingHeuristics
{
  /** Names of the encodings this heuristic can report. */
  object EncodingNames {
    // UCS-4 isn't yet implemented in java releases anyway...
    // so every UCS-4 byte order is reported under the same name.
    val bigUCS4       = "UCS-4"
    val littleUCS4    = "UCS-4"
    val unusualUCS4   = "UCS-4"
    val bigUTF16      = "UTF-16BE"
    val littleUTF16   = "UTF-16LE"
    val utf8          = "UTF-8"
    val default       = utf8
  }
  import EncodingNames._

  /**
   * <p>
   * This utility method attempts to determine the XML character encoding
   * by examining the input stream, as specified here:
   * http://www.w3.org/TR/xml/#sec-guessing
   * </p>
   * <p>
   * The stream is marked before anything is read and reset before the
   * method returns normally, so the caller sees the stream at its
   * original position. At most 1024 bytes are looked at.
   * </p>
   *
   * @param in <code>InputStream</code> to read from.
   * @return String The name of the encoding; UTF-8 when nothing
   *                more specific can be determined.
   * @throws IOException if the stream cannot be reset
   */
  def readEncodingFromStream(in: InputStream): String = {
    val bytesToRead = 1024 // enough to read most XML encoding declarations

    // This may fail if there are a lot of space characters before the end
    // of the encoding declaration
    in.mark(bytesToRead)
    // Each element is 0-255, or -1 when the stream ended early.
    val bytes = (in.read(), in.read(), in.read(), in.read())

    // First look for a byte order mark. The four-byte UCS-4 marks must be
    // tested before the two-byte UTF-16 marks they start with.
    val fromByteOrderMark: Option[String] = bytes match {
      case (0x00, 0x00, 0xFE, 0xFF) => Some(bigUCS4)
      case (0xFF, 0xFE, 0x00, 0x00) => Some(littleUCS4)
      case (0x00, 0x00, 0xFF, 0xFE) => Some(unusualUCS4)
      case (0xFE, 0xFF, 0x00, 0x00) => Some(unusualUCS4)
      case (0xFE, 0xFF, _   , _   ) => Some(bigUTF16)
      case (0xFF, 0xFE, _   , _   ) => Some(littleUTF16)
      case (0xEF, 0xBB, 0xBF, _   ) => Some(utf8)
      case _                        => None
    }

    // Extracts the encoding pseudo-attribute from an ASCII-compatible XML
    // declaration whose first four bytes ("<?xm") were already consumed.
    def readASCIIEncoding: String = {
      val data = new Array[Byte](bytesToRead - 4)
      // read returns -1 when the stream ends right after the first four
      // bytes; treat that as "nothing more to look at" instead of passing
      // a negative length to the String constructor.
      val length = math.max(in.read(data, 0, data.length), 0)

      // Use Latin-1 (ISO-8859-1) because all byte sequences are legal.
      val lookahead = new String(data, 0, length, "ISO-8859-1")
      // Only the XML declaration itself may carry the encoding; stop at its
      // closing "?>" so that an `encoding="..."` attribute further down the
      // document is not mistaken for the declaration. If no terminator is
      // within the lookahead, fall back to searching all of it.
      val declarationEnd = lookahead.indexOf("?>")
      val declaration =
        if (declarationEnd >= 0) lookahead.substring(0, declarationEnd) else lookahead

      val regexp = """(?m).*?encoding\s*=\s*["'](.+?)['"]""".r
      regexp.findFirstMatchIn(declaration).map(_.group(1)).getOrElse(default)
    }

    // The fallback is only evaluated when no byte order mark is present;
    // then the first character must be '<' or whitespace.
    val encoding = fromByteOrderMark getOrElse (bytes match {
      case (0x00, 0x00, 0x00, '<' ) => bigUCS4
      case ('<' , 0x00, 0x00, 0x00) => littleUCS4
      case (0x00, 0x00, '<' , 0x00) => unusualUCS4
      case (0x00, '<' , 0x00, 0x00) => unusualUCS4
      case (0x00, '<' , 0x00, '?' ) => bigUTF16     // XXX must read encoding
      case ('<' , 0x00, '?' , 0x00) => littleUTF16  // XXX must read encoding
      case ('<' , '?' , 'x' , 'm' ) => readASCIIEncoding
      case (0x4C, 0x6F, 0xA7, 0x94) => utf8         // XXX EBCDIC
      case _                        => utf8         // no XML or text declaration present
    })

    in.reset()
    encoding
  }
}