/* __ *\ ** ________ ___ / / ___ Scala API ** ** / __/ __// _ | / / / _ | (c) 2003-2009, LAMP/EPFL ** ** __\ \/ /__/ __ |/ /__/ __ | http://scala-lang.org/ ** ** /____/\___/_/ |_/____/_/ | | ** ** |/ ** \* */ // $Id: Utility.scala 18210 2009-07-05 14:46:37Z extempore $ package scala.xml import collection.mutable.{Set, HashSet} import parsing.XhtmlEntities /** * The <code>Utility</code> object provides utility functions for processing * instances of bound and not bound XML classes, as well as escaping text nodes. * * @author Burak Emir */ object Utility extends AnyRef with parsing.TokenTests { implicit def implicitSbToString(sb: StringBuilder) = sb.toString() // helper for the extremely oft-repeated sequence of creating a // StringBuilder, passing it around, and then grabbing its String. private [xml] def sbToString(f: (StringBuilder) => Unit): String = { val sb = new StringBuilder f(sb) sb.toString } private[xml] def isAtomAndNotText(x: Node) = x.isAtom && !x.isInstanceOf[Text] /** trims an element - call this method, when you know that it is an * element (and not a text node) so you know that it will not be trimmed * away. With this assumption, the function can return a <code>Node</code>, * rather than a <code>Seq[Node]</code>. If you don't know, call * <code>trimProper</code> and account for the fact that you may get back * an empty sequence of nodes. * * precondition: node is not a text node (it might be trimmed) */ def trim(x: Node): Node = x match { case Elem(pre, lab, md, scp, child@_*) => Elem(pre, lab, md, scp, (child flatMap trimProper):_*) } /** trim a child of an element. <code>Attribute</code> values and * <code>Atom</code> nodes that are not <code>Text</code> nodes are unaffected. */ def trimProper(x:Node): Seq[Node] = x match { case Elem(pre,lab,md,scp,child@_*) => Elem(pre,lab,md,scp, (child flatMap trimProper):_*) case Text(s) => new TextBuffer().append(s).toText case _ => x } /** returns a sorted attribute list */ def sort(md: MetaData): MetaData = if((md eq Null) || (md.next eq Null)) md else { val key = md.key val smaller = sort(md.filter { m => m.key < key }) val greater = sort(md.filter { m => m.key > key }) smaller.append( Null ).append(md.copy ( greater )) } /** returns the node with its attribute list sorted alphabetically (prefixes are ignored) */ def sort(n:Node): Node = n match { case Elem(pre,lab,md,scp,child@_*) => Elem(pre,lab,sort(md),scp, (child map sort):_*) case _ => n } @deprecated("a string might also be Atom(s) - define your own conversion") def view(s: String): Text = Text(s) /** * Escapes the characters < > & and " from string. * * @param text ... * @return ... */ final def escape(text: String): String = sbToString(escape(text, _)) object Escapes { /** For reasons unclear escape and unescape are a long ways from being logical inverses. */ private val pairs = List( "lt" -> '<', "gt" -> '>', "amp" -> '&', "quot" -> '"' // enigmatic comment explaining why this isn't escaped -- // is valid xhtml but not html, and IE doesn't know it, says jweb // "apos" -> '\'' ) val escMap = Map((for ((s, c) <- pairs) yield (c, "&%s;" format s)) : _*) val unescMap = Map(("apos" -> '\'') :: pairs : _*) } import Escapes._ /** * Appends escaped string to <code>s</code>. * * @param text ... * @param s ... * @return ... */ final def escape(text: String, s: StringBuilder): StringBuilder = text.foldLeft(s)((s, c) => escMap.get(c) match { case Some(str) => s append str case None => s append c }) /** * Appends unescaped string to <code>s</code>, amp becomes & * lt becomes < etc.. * * @param ref ... * @param s ... * @return <code>null</code> if <code>ref</code> was not a predefined * entity. */ final def unescape(ref: String, s: StringBuilder): StringBuilder = (unescMap get ref) map (s append _) getOrElse null /** * Returns a set of all namespaces used in a sequence of nodes * and all their descendants, including the empty namespaces. * * @param nodes ... * @return ... */ def collectNamespaces(nodes: Seq[Node]): Set[String] = nodes.foldLeft(new HashSet[String]) { (set, x) => collectNamespaces(x, set) ; set } /** * Adds all namespaces in node to set. * * @param n ... * @param set ... */ def collectNamespaces(n: Node, set: Set[String]) { if (n.doCollectNamespaces) { set += n.namespace for (a <- n.attributes) a match { case _:PrefixedAttribute => set += a.getNamespace(n) case _ => } for (i <- n.child) collectNamespaces(i, set) } } // def toXML( // x: Node, // pscope: NamespaceBinding = TopScope, // sb: StringBuilder = new StringBuilder, // stripComments: Boolean = false, // decodeEntities: Boolean = true, // preserveWhitespace: Boolean = false, // minimizeTags: Boolean = false): String = // { // toXMLsb(x, pscope, sb, stripComments, decodeEntities, preserveWhitespace, minimizeTags) // sb.toString() // } def toXML( x: Node, pscope: NamespaceBinding = TopScope, sb: StringBuilder = new StringBuilder, stripComments: Boolean = false, decodeEntities: Boolean = true, preserveWhitespace: Boolean = false, minimizeTags: Boolean = false): StringBuilder = { x match { case c: Comment if !stripComments => c buildString sb case x: SpecialNode => x buildString sb case g: Group => for (c <- g.nodes) toXML(c, x.scope, sb) ; sb case _ => // print tag with namespace declarations sb.append('<') x.nameToString(sb) if (x.attributes ne null) x.attributes.buildString(sb) x.scope.buildString(sb, pscope) if (x.child.isEmpty && minimizeTags) // no children, so use short form: <xyz .../> sb.append(" />") else { // children, so use long form: <xyz ...>...</xyz> sb.append('>') sequenceToXML(x.child, x.scope, sb, stripComments) sb.append("</") x.nameToString(sb) sb.append('>') } } } def sequenceToXML( children: Seq[Node], pscope: NamespaceBinding = TopScope, sb: StringBuilder = new StringBuilder, stripComments: Boolean = false): Unit = { if (children.isEmpty) return else if (children forall isAtomAndNotText) { // add space val it = children.iterator val f = it.next toXML(f, pscope, sb) while (it.hasNext) { val x = it.next sb.append(' ') toXML(x, pscope, sb) } } else children foreach { toXML(_, pscope, sb) } } /** * Returns prefix of qualified name if any. * * @param name ... * @return ... */ final def prefix(name: String): Option[String] = (name indexOf ':') match { case -1 => None case i => Some(name.substring(0, i)) } /** * Returns a hashcode for the given constituents of a node * * @param uri * @param label * @param attribHashCode * @param children */ def hashCode(pre: String, label: String, attribHashCode: Int, scpeHash: Int, children: Seq[Node]) = ( ( if(pre ne null) {41 * pre.hashCode() % 7} else {0}) + label.hashCode() * 53 + attribHashCode * 7 + scpeHash * 31 + { var c = 0 val i = children.iterator while(i.hasNext) c = c * 41 + i.next.hashCode c } ) def appendQuoted(s: String): String = sbToString(appendQuoted(s, _)) /** * Appends "s" if string <code>s</code> does not contain ", * 's' otherwise. * * @param s ... * @param sb ... * @return ... */ def appendQuoted(s: String, sb: StringBuilder) = { val ch = if (s contains '"') '\'' else '"' sb.append(ch).append(s).append(ch) } /** * Appends "s" and escapes and " i s with \" * * @param s ... * @param sb ... * @return ... */ def appendEscapedQuoted(s: String, sb: StringBuilder): StringBuilder = { sb.append('"') for (c <- s) c match { case '"' => sb.append('\\'); sb.append('"') case _ => sb.append(c) } sb.append('"') } /** * @param s ... * @param index ... * @return ... */ def getName(s: String, index: Int): String = { if (index >= s.length) null else (s drop index) match { case Seq(x, xs @ _*) if isNameStart(x) => (Array(x) ++ (xs takeWhile isNameChar)).mkString case _ => "" } } /** * Returns <code>null</code> if the value is a correct attribute value, * error message if it isn't. * * @param value ... * @return ... */ def checkAttributeValue(value: String): String = { var i = 0 while (i < value.length) { value.charAt(i) match { case '<' => return "< not allowed in attribute value"; case '&' => val n = getName(value, i+1) if (n eq null) return "malformed entity reference in attribute value ["+value+"]"; i = i + n.length + 1 if (i >= value.length || value.charAt(i) != ';') return "malformed entity reference in attribute value ["+value+"]"; case _ => } i = i + 1 } null } /** * new * * @param value ... * @return ... */ def parseAttributeValue(value: String): Seq[Node] = { val zs: Seq[Char] = value val sb = new StringBuilder var rfb: StringBuilder = null val nb = new NodeBuffer() val it = zs.iterator while (it.hasNext) { var c = it.next c match { case '&' => // entity! flush buffer into text node it.next match { case '#' => c = it.next val theChar = parseCharRef ({ ()=> c },{ () => c = it.next },{s => throw new RuntimeException(s)}) sb.append(theChar) case x => if (rfb eq null) rfb = new StringBuilder() rfb.append(x) c = it.next while (c != ';') { rfb.append(c) c = it.next } val ref = rfb.toString() rfb.setLength(0) unescape(ref,sb) match { case null => if (sb.length > 0) { // flush buffer nb += Text(sb.toString()) sb.setLength(0) } nb += EntityRef(sb.toString()) // add entityref case _ => } } case x => sb.append(x) } } if (sb.length > 0) { // flush buffer val x = Text(sb.toString()) if (nb.length == 0) return x else nb += x } nb } /** * <pre> * CharRef ::= "&#" '0'..'9' {'0'..'9'} ";" * | "&#x" '0'..'9'|'A'..'F'|'a'..'f' { hexdigit } ";" * </pre> * <p> * see [66] * <p> * * @param ch ... * @param nextch ... * @param reportSyntaxError ... * @return ... */ def parseCharRef(ch: () => Char, nextch: () => Unit, reportSyntaxError: String => Unit): String = { val hex = (ch() == 'x') && { nextch(); true } val base = if (hex) 16 else 10 var i = 0 while (ch() != ';') { ch() match { case '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' => i = i * base + ch().asDigit case 'a' | 'b' | 'c' | 'd' | 'e' | 'f' | 'A' | 'B' | 'C' | 'D' | 'E' | 'F' => if (! hex) reportSyntaxError("hex char not allowed in decimal char ref\n" + "Did you mean to write &#x ?") else i = i * base + ch().asDigit case _ => reportSyntaxError("character '" + ch() + "' not allowed in char ref\n") } nextch() } new String(io.UTF8Codec.encode(i), "utf8") } }