Quantcast

Parsing byte sequences

classic Classic list List threaded Threaded
7 messages Options
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Parsing byte sequences

John Ky
Hi,

Anyone know how to use the parser combinators on byte arrays and InputStreams?  I want to write a parser that will parse bytes rather than characters.

Cheers,

-John
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Parsing byte sequences

Daniel Sobral
I don't have any experience with that, but are you aware that the input element is defined by the type "Elem", which you need to define when extending Parsers?

On Tue, Dec 29, 2009 at 11:36 AM, John Ky <[hidden email]> wrote:
Hi,

Anyone know how to use the parser combinators on byte arrays and InputStreams?  I want to write a parser that will parse bytes rather than characters.

Cheers,

-John



--
Daniel C. Sobral

I travel to the future all the time.
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Parsing byte sequences

Rex Kerr-2
In reply to this post by John Ky
If you just want it to work (as opposed to being incredibly efficient), then use
  scala.util.parsing.input.StreamReader( ... )
to create a parser-compatible reader from any Java reader.  (This uses apply(...) from the companion object; it's not a constructor call.)

To read an InputStream, use
  new java.io.InputStreamReader( myInputStream )
in place of the "..." above.

To read a byte array, wrap your byte array in a ByteArrayInputStream:
  new java.io.InputStreamReader( new java.io.ByteArrayInputStream( myArray ) )

In this case, though, if you really wanted/needed efficiency, you'd likely be better off by subclassing Reader[Char] to use a byte array as a buffer.  I don't know whether you'd need to wrap your byte array in something that made it obey the java.lang.CharSequence interface; if so, you could just do that directly and use scala.util.parsing.input.CharSequenceReader to get the reader for the parser.

  --Rex

On Tue, Dec 29, 2009 at 8:36 AM, John Ky <[hidden email]> wrote:
Hi,

Anyone know how to use the parser combinators on byte arrays and InputStreams?  I want to write a parser that will parse bytes rather than characters.

Cheers,

-John

Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Parsing byte sequences

Paul Phillips-3
In reply to this post by John Ky
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams?  I want to write a parser that will parse bytes rather
> than characters.

I happen to be working on a very entertaining project involving that at
this very moment.  I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.  
(Some of this is specific to my project.)


import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec

import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble

/** Element-level convenience combinators that work for any `Elem` type.
  *
  * Everything here is built from the stock `elem`, `rep`, `repN` and `not`
  * combinators of [[Parsers]].
  */
trait ParsersUtil extends Parsers {
  /** Accepts the next element whatever it is; the predicate never rejects. */
  lazy val anyElem: Parser[Elem] = elem("anyElem", (_: Elem) => true)

  /** Accepts one element that is equal to none of `xs`. */
  def elemExcept(xs: Elem*): Parser[Elem] =
    elem("elemExcept", candidate => xs.forall(_ != candidate))

  /** Accepts one element that is equal to at least one of `xs`. */
  def elemOf(xs: Elem*): Parser[Elem] =
    elem("elemOf", candidate => xs.exists(_ == candidate))

  /** Applies `anyElem` exactly `n` times and returns the results in order. */
  def take(n: Int): Parser[Seq[Elem]] =
    repN(n, anyElem)

  /** Collects arbitrary elements for as long as `cond` does not match at the current position. */
  def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] =
    takeUntil(cond, anyElem)

  /** Repeats `p` for as long as `cond` does not match at the current position.
    *
    * `cond` is only used as a negative lookahead (via `not`); only `p` contributes results.
    */
  def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(not(cond) ~> p)

  /** Repeats `p` zero or more times, stopping at the first position where it does not match. */
  def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] =
    rep(p)
}

/** A [[Position]] expressed purely as a byte offset.
  *
  * @param offset zero-based offset into the input
  */
case class ByteOffsetPosition(offset: Int) extends Position {
  /** Every position reports line 1. */
  final val line = 1

  /** One-based column: the zero-based `offset` shifted by one. */
  def column: Int = 1 + offset

  /** Always the empty string; no line text is provided for this kind of position. */
  def lineContents: String = ""
}

/** A [[Reader]] over an in-memory byte array, positioned at `offset`.
  *
  * Advancing (`rest`, `drop`) returns a reader that shares the same backing array;
  * the array itself is never copied by those operations.
  *
  * @param bytes  the backing array
  * @param offset index in `bytes` of the next byte to be read
  */
class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
  /** Builds a reader from the text of `reader.source`, encoded with the platform default
    * charset, starting at offset 0.
    */
  def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)

  /** Builds a reader over a copy of `bytes`, starting at offset 0. */
  def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)

  /** Builds a reader over `str` encoded with the platform default charset, starting at offset 0. */
  def this(str: String) = this(str.getBytes, 0)

  /** The whole backing array viewed as characters, one char per byte.
    *
    * Each byte is masked to its unsigned value (0..255) first, so bytes >= 0x80 map to
    * U+0080..U+00FF instead of being sign-extended to U+FF80..U+FFFF.
    */
  override def source: CharSequence =
    java.nio.CharBuffer.wrap(bytes.map(b => (b & 0xFF).toChar))

  /** The byte at the current offset.
    *
    * At end of input this returns `EofCh.toByte` (0x1A), which cannot be told apart from a
    * genuine 0x1A byte in the data; check `atEnd` to detect end of input reliably.
    */
  def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte

  /** A reader advanced by one byte, or this reader when already at the end. */
  def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this

  def pos: Position = ByteOffsetPosition(offset)

  def atEnd: Boolean = offset >= bytes.length

  /** The byte at absolute index `n` of the backing array (not relative to `offset`). */
  def byteAt(n: Int): Byte = bytes(n)

  /** Number of bytes from the current offset to the end of the backing array. */
  def length: Int = bytes.length - offset

  /** A reader advanced by `n` bytes.
    *
    * The advance is clamped to `[0, length]`, matching what repeated `rest` calls would do:
    * a non-positive `n` leaves the reader where it is, and an `n` larger than the remaining
    * input stops at the end instead of pushing the offset past it.
    */
  override def drop(n: Int): ByteReader = {
    val advance = math.max(0, math.min(n, length))
    if (advance == 0) this else new ByteReader(bytes, offset + advance)
  }

  /** Up to `n` bytes starting at the current offset; fewer when less input remains,
    * none when `n` is not positive.
    */
  def take(n: Int): Seq[Byte] = {
    // Copy only the requested window rather than the whole remaining array.
    val start = math.min(math.max(offset, 0), bytes.length)
    val count = math.max(0, math.min(n, bytes.length - start))
    bytes.slice(start, start + count)
  }

  override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}

/** Parser combinators whose input elements are bytes.
  *
  * Multi-byte values are decoded big-endian: the first byte read is the most significant.
  */
trait BinaryParsers extends Parsers with ParsersUtil {
  type Elem = Byte

  /** Lets any `Input` be used where a `ByteReader` is needed.
    *
    * A `ByteReader` is passed through untouched; anything else is wrapped in a new one.
    * NOTE(review): the fallback goes through `ByteReader`'s `Reader[_]` constructor, which
    * starts at offset 0 of the re-encoded source, so the current offset of `x` appears to
    * be discarded - confirm this is intended.
    */
  protected implicit def readerToByteReader(x: Input): ByteReader = x match {
    case br: ByteReader => br
    case _              => new ByteReader(x)
  }

  /** Folds `bytes` into an `Int`, first byte most significant, each byte treated as unsigned.
    * With more than four bytes the earliest ones are shifted out of the result.
    */
  def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))

  /** Folds `bytes` into a `Long`, first byte most significant, each byte treated as unsigned.
    * With more than eight bytes the earliest ones are shifted out of the result.
    */
  def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

  /** One raw (signed) byte. */
  lazy val byte: Parser[Byte] = anyElem
  /** One byte as an unsigned value in 0..255. */
  lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
  /** Two bytes, big-endian, as an `Int`. */
  lazy val u2: Parser[Int] = bytes(2) ^^ toInt
  /** Four bytes, big-endian, as an `Int`; the result is negative when the top bit is set. */
  lazy val u4: Parser[Int] = bytes(4) ^^ toInt
  /** Four bytes reinterpreted as a `Float` via `intBitsToFloat`. */
  lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
  /** Eight bytes, big-endian, as a `Long`. */
  lazy val u8: Parser[Long] = bytes(8) ^^ toLong
  /** Eight bytes reinterpreted as a `Double` via `longBitsToDouble`. */
  lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

  /** Consumes exactly `n` bytes.
    *
    * Fails without consuming anything when fewer than `n` bytes remain, or when `n` is
    * negative. A negative count can arise from data (e.g. a length field read with `u4`
    * whose top bit is set) and must not be passed on to `drop`, where it could move the
    * reader backwards.
    */
  def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
    if (n < 0) Failure("Requested a negative number of bytes: %d".format(n), in)
    else if (n <= in.length) Success(in take n, in drop n)
    else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
  }

  /** Runs `p` on `in`. */
  def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)

  /** Runs `p` on the bytes of `in` (encoded as done by `ByteReader`'s `String` constructor). */
  def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}

--
Paul Phillips      | Before a man speaks it is always safe to assume
Everyman           | that he is a fool.  After he speaks, it is seldom
Empiricist         | necessary to assume it.
slap pi uphill!    |     -- H. L. Mencken
Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Parsing byte sequences

John Ky
Hi Paul,

Thanks for that.  It works well.  It would be nice if the Scala Library included some of this basic functionality.

If your work makes processing binary data easier, I'd certainly be interested in taking a look when you publish it.

Cheers,

-John

On Wed, Dec 30, 2009 at 3:07 AM, Paul Phillips <[hidden email]> wrote:
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams?  I want to write a parser that will parse bytes rather
> than characters.

I happen to be working on a very entertaining project involving that at
this very moment.  I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.
(Some of this is specific to my project.)


import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec

import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble

trait ParsersUtil extends Parsers {
 lazy val anyElem: Parser[Elem]          = elem("anyElem", _ => true)
 def elemExcept(xs: Elem*): Parser[Elem] = elem("elemExcept", x => !(xs contains x))
 def elemOf(xs: Elem*): Parser[Elem]     = elem("elemOf", xs contains _)

 def take(n: Int): Parser[Seq[Elem]] = repN(n, anyElem)
 def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] = takeUntil(cond, anyElem)
 def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] = rep(not(cond) ~> p)
 def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] = rep(p)
}

case class ByteOffsetPosition(offset: Int) extends Position {
 final val line = 1
 def column = offset + 1
 def lineContents: String = ""
}

class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
 def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)
 def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)
 def this(str: String) = this(str.getBytes, 0)

 override def source = bytes map (_.toChar)

 def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte
 def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this
 def pos: Position = ByteOffsetPosition(offset)
 def atEnd = offset >= bytes.length

 def byteAt(n: Int) = bytes(n)
 def length = bytes.length - offset

 override def drop(n: Int): ByteReader = new ByteReader(bytes, offset + n)
 def take(n: Int): Seq[Byte] = bytes drop offset take n

 override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}

trait BinaryParsers extends Parsers with ParsersUtil {
 type Elem = Byte

 protected implicit def readerToByteReader(x: Input): ByteReader = x match {
   case br: ByteReader => br
   case _              => new ByteReader(x)
 }
 def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))
 def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

 lazy val byte: Parser[Byte] = anyElem
 lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
 lazy val u2: Parser[Int] = bytes(2) ^^ toInt
 lazy val u4: Parser[Int] = bytes(4) ^^ toInt
 lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
 lazy val u8: Parser[Long] = bytes(8) ^^ toLong
 lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

 def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
   if (n <= in.length) Success(in take n, in drop n)
   else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
 }

 def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)
 def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}

--
Paul Phillips      | Before a man speaks it is always safe to assume
Everyman           | that he is a fool.  After he speaks, it is seldom
Empiricist         | necessary to assume it.
slap pi uphill!    |     -- H. L. Mencken

Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Parsing byte sequences

John Ky
In reply to this post by Paul Phillips-3
Hi Paul,

I had an issue with EOF, where EofCh.toByte was being consumed by my parser as 26.  My parsers therefore fail to parse the EOF properly.  I fixed this by using an exception like this:

  BinaryReader.scala:
  def first: Byte = {
    if (offset < bytes.length) {
      bytes(offset)
    } else {
      throw EofException
    }
  }

  BinaryParsers.scala:
  override def acceptIf(p: Elem => Boolean)(err: Elem => String): Parser[Elem] = Parser { in =>
    try {
      if (p(in.first)) {
        Success(in.first, in.rest)
      } else {
        Failure(err(in.first), in)
      }
    } catch {
      case e if e eq EofException => Failure("EOF unexpected", in)
    }
  }
 
  override def acceptMatch[U](expected: String, f: PartialFunction[Elem, U]): Parser[U] = Parser{ in =>
    try {
      if (f.isDefinedAt(in.first)) {
        Success(f(in.first), in.rest)
      } else {
        Failure(expected + " expected", in)
      }
    } catch {
      case e if e eq EofException => Failure("EOF unexpected: " + expected + " expected", in)
    }
  }

Because I reuse the EofException object all the time, it shouldn't be too expensive.

Cheers,

-John


2009/12/30 Paul Phillips <[hidden email]>
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams?  I want to write a parser that will parse bytes rather
> than characters.

I happen to be working on a very entertaining project involving that at
this very moment.  I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.
(Some of this is specific to my project.)


import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec

import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble

trait ParsersUtil extends Parsers {
 lazy val anyElem: Parser[Elem]          = elem("anyElem", _ => true)
 def elemExcept(xs: Elem*): Parser[Elem] = elem("elemExcept", x => !(xs contains x))
 def elemOf(xs: Elem*): Parser[Elem]     = elem("elemOf", xs contains _)

 def take(n: Int): Parser[Seq[Elem]] = repN(n, anyElem)
 def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] = takeUntil(cond, anyElem)
 def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] = rep(not(cond) ~> p)
 def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] = rep(p)
}

case class ByteOffsetPosition(offset: Int) extends Position {
 final val line = 1
 def column = offset + 1
 def lineContents: String = ""
}

class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
 def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)
 def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)
 def this(str: String) = this(str.getBytes, 0)

 override def source = bytes map (_.toChar)

 def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte
 def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this
 def pos: Position = ByteOffsetPosition(offset)
 def atEnd = offset >= bytes.length

 def byteAt(n: Int) = bytes(n)
 def length = bytes.length - offset

 override def drop(n: Int): ByteReader = new ByteReader(bytes, offset + n)
 def take(n: Int): Seq[Byte] = bytes drop offset take n

 override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}

trait BinaryParsers extends Parsers with ParsersUtil {
 type Elem = Byte

 protected implicit def readerToByteReader(x: Input): ByteReader = x match {
   case br: ByteReader => br
   case _              => new ByteReader(x)
 }
 def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))
 def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

 lazy val byte: Parser[Byte] = anyElem
 lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
 lazy val u2: Parser[Int] = bytes(2) ^^ toInt
 lazy val u4: Parser[Int] = bytes(4) ^^ toInt
 lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
 lazy val u8: Parser[Long] = bytes(8) ^^ toLong
 lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

 def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
   if (n <= in.length) Success(in take n, in drop n)
   else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
 }

 def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)
 def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}

--
Paul Phillips      | Before a man speaks it is always safe to assume
Everyman           | that he is a fool.  After he speaks, it is seldom
Empiricist         | necessary to assume it.
slap pi uphill!    |     -- H. L. Mencken

Reply | Threaded
Open this post in threaded view
|  
Report Content as Inappropriate

Re: Parsing byte sequences

Mushtaq Ahmed
In reply to this post by Paul Phillips-3
Sorry for commenting on an old thread, but BinaryParser looks really nice. Hope to see it in trunk some day!

I guess the final version will also have a way to specify little/big endianness, is that right?

Thanks,
Mushtaq


On Tue, Dec 29, 2009 at 9:37 PM, Paul Phillips <[hidden email]> wrote:
On Wed, Dec 30, 2009 at 12:36:07AM +1100, John Ky wrote:
> Anyone know how to use the parser combinators on byte arrays and
> InputStreams?  I want to write a parser that will parse bytes rather
> than characters.

I happen to be working on a very entertaining project involving that at
this very moment.  I'll publish this all pretty soon but here are a few
classes I use to make the standard library seem more byte oriented.
(Some of this is specific to my project.)


import scala.util.parsing.combinator._
import scala.util.parsing.input.{ Position, Reader }
import scala.util.parsing.input.CharArrayReader.EofCh
import scala.annotation.tailrec

import java.lang.Float.intBitsToFloat
import java.lang.Double.longBitsToDouble

trait ParsersUtil extends Parsers {
 lazy val anyElem: Parser[Elem]          = elem("anyElem", _ => true)
 def elemExcept(xs: Elem*): Parser[Elem] = elem("elemExcept", x => !(xs contains x))
 def elemOf(xs: Elem*): Parser[Elem]     = elem("elemOf", xs contains _)

 def take(n: Int): Parser[Seq[Elem]] = repN(n, anyElem)
 def takeUntil(cond: Parser[Elem]): Parser[Seq[Elem]] = takeUntil(cond, anyElem)
 def takeUntil(cond: Parser[Elem], p: Parser[Elem]): Parser[Seq[Elem]] = rep(not(cond) ~> p)
 def takeWhile(p: Parser[Elem]): Parser[Seq[Elem]] = rep(p)
}

case class ByteOffsetPosition(offset: Int) extends Position {
 final val line = 1
 def column = offset + 1
 def lineContents: String = ""
}

class ByteReader(val bytes: Array[Byte], override val offset: Int) extends Reader[Byte] {
 def this(reader: Reader[_]) = this(reader.source.toString.getBytes, 0)
 def this(bytes: Seq[Byte]) = this(bytes.toArray, 0)
 def this(str: String) = this(str.getBytes, 0)

 override def source = bytes map (_.toChar)

 def first: Byte = if (offset < bytes.length) bytes(offset) else EofCh.toByte
 def rest: ByteReader = if (offset < bytes.length) new ByteReader(bytes, offset + 1) else this
 def pos: Position = ByteOffsetPosition(offset)
 def atEnd = offset >= bytes.length

 def byteAt(n: Int) = bytes(n)
 def length = bytes.length - offset

 override def drop(n: Int): ByteReader = new ByteReader(bytes, offset + n)
 def take(n: Int): Seq[Byte] = bytes drop offset take n

 override def toString = "ByteReader(%d / %d)".format(offset, bytes.length)
}

trait BinaryParsers extends Parsers with ParsersUtil {
 type Elem = Byte

 protected implicit def readerToByteReader(x: Input): ByteReader = x match {
   case br: ByteReader => br
   case _              => new ByteReader(x)
 }
 def toInt(bytes: Seq[Byte]): Int = bytes.foldLeft(0)((x, b) => (x << 8) + (b & 0xFF))
 def toLong(bytes: Seq[Byte]): Long = bytes.foldLeft(0L)((x, b) => (x << 8) + (b & 0xFF))

 lazy val byte: Parser[Byte] = anyElem
 lazy val u1: Parser[Int] = byte ^^ (_ & 0xFF)
 lazy val u2: Parser[Int] = bytes(2) ^^ toInt
 lazy val u4: Parser[Int] = bytes(4) ^^ toInt
 lazy val u4f: Parser[Float] = u4 ^^ intBitsToFloat
 lazy val u8: Parser[Long] = bytes(8) ^^ toLong
 lazy val u8d: Parser[Double] = u8 ^^ longBitsToDouble

 def bytes(n: Int): Parser[Seq[Byte]] = Parser { in =>
   if (n <= in.length) Success(in take n, in drop n)
   else Failure("Requested %d bytes but only %d remain".format(n, in.length), in)
 }

 def parse[T](p: Parser[T], in: Input): ParseResult[T] = p(in)
 def parse[T](p: Parser[T], in: String): ParseResult[T] = parse(p, new ByteReader(in))
}

--
Paul Phillips      | Before a man speaks it is always safe to assume
Everyman           | that he is a fool.  After he speaks, it is seldom
Empiricist         | necessary to assume it.
slap pi uphill!    |     -- H. L. Mencken

Loading...