parser.go 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476
  1. package sexp
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "strconv"
  7. )
  8. // Parses S-expressions from a given io.RuneReader.
  9. //
  10. // Returned node is a virtual list node with all the S-expressions read from
  11. // the stream as children. In case of a syntax error, the returned error is not
  12. // nil.
  13. //
  14. // It's worth explaining where do you get *SourceFile from. The typical way to
  15. // create it is:
  16. // var ctx SourceContext
  17. // f := ctx.AddFile(filename, length)
  18. //
  19. // And you'll be able to use ctx later for decoding source location
  20. // information. It's ok to provide -1 as length if it's unknown. In that case
  21. // though you won't be able to add more files to the given SourceContext until
  22. // the file with unknown length is finalized, which happens when parsing is
  23. // finished.
  24. //
  25. // Also f is optional, nil is a perfectly valid argument for it, in that case
  26. // it will create a temporary context and add an unnamed file to it. Less setup
  27. // work is required, but you lose the ability to decode error source code
  28. // locations.
  29. func Parse(r io.RuneReader, f *SourceFile) (*Node, error) {
  30. var ctx SourceContext
  31. if f == nil {
  32. f = ctx.AddFile("", -1)
  33. }
  34. var p parser
  35. p.r = r
  36. p.f = f
  37. p.last_seq = seq{offset: -1}
  38. p.expect_eof = true
  39. return p.parse()
  40. }
  41. // Parses a single S-expression node from a stream.
  42. //
  43. // Returns just one node, be it a value or a list, doesn't touch the rest of
  44. // the data. In case of a syntax error, the returned error is not nil.
  45. //
  46. // Note that unlike Parse it requires io.RuneScanner. It's a technical
  47. // requirement, because in some cases s-expressions syntax delimiter is not
  48. // part of the s-expression value, like in a very simple example: "x y". "x"
  49. // here will be returned as a value Node, but " y" should remain untouched,
  50. // however without reading the space character we can't tell if this is the end
  51. // of "x" or not. Hence the requirement of being able to unread one rune.
  52. //
  53. // It's unclear what to do about error reporting for S-expressions read from
  54. // the stream. The usual idea of lines and columns doesn't apply here. Hence if
  55. // you do want to report errors gracefully some hacks will be necessary to do
  56. // so.
  57. //
  58. // NOTE: Maybe ParseOne will be changed in future to better serve the need of
  59. // good error reporting.
  60. func ParseOne(r io.RuneScanner, f *SourceFile) (*Node, error) {
  61. var ctx SourceContext
  62. if f == nil {
  63. f = ctx.AddFile("", -1)
  64. }
  65. var p parser
  66. p.r = r
  67. p.rs = r
  68. p.f = f
  69. p.last_seq = seq{offset: -1}
  70. p.expect_eof = true
  71. return p.parse_one_node()
  72. }
  73. // This error structure is Parse* functions family specific, it returns information
  74. // about errors encountered during parsing. Location can be decoded using the
  75. // context you passed in as an argument. If the context was nil, then the location
  76. // is simply a byte offset from the beginning of the input stream.
  77. type ParseError struct {
  78. Location SourceLoc
  79. message string
  80. }
  81. // Satisfy the built-in error interface. Returns the error message (without
  82. // source location).
  83. func (e *ParseError) Error() string {
  84. return e.message
  85. }
  86. var seq_delims = map[rune]rune{
  87. '(': ')',
  88. '`': '`',
  89. '"': '"',
  90. }
  91. func is_hex(r rune) bool {
  92. return (r >= '0' && r <= '9') ||
  93. (r >= 'a' && r <= 'f') ||
  94. (r >= 'A' && r <= 'F')
  95. }
  96. func is_space(r rune) bool {
  97. return r == ' ' || r == '\t' || r == '\n' || r == '\r'
  98. }
  99. func is_delimiter(r rune) bool {
  100. return is_space(r) || r == ')' || r == ';' || r == 0
  101. }
  102. type seq struct {
  103. offset int
  104. rune rune
  105. }
  106. type delim_state struct {
  107. last_seq seq
  108. expect_eof bool
  109. }
  110. type parser struct {
  111. r io.RuneReader
  112. rs io.RuneScanner
  113. f *SourceFile
  114. buf bytes.Buffer
  115. offset int
  116. cur rune
  117. curlen int
  118. delim_state
  119. }
  120. func (p *parser) advance_delim_state() delim_state {
  121. s := p.delim_state
  122. p.last_seq = seq{p.offset, p.cur}
  123. p.expect_eof = false
  124. return s
  125. }
  126. func (p *parser) restore_delim_state(s delim_state) {
  127. p.delim_state = s
  128. }
  129. func (p *parser) error(loc SourceLoc, format string, args ...interface{}) {
  130. panic(&ParseError{
  131. Location: loc,
  132. message: fmt.Sprintf(format, args...),
  133. })
  134. }
  135. func (p *parser) next() {
  136. p.offset += p.curlen
  137. r, s, err := p.r.ReadRune()
  138. if err != nil {
  139. if err == io.EOF {
  140. if p.expect_eof {
  141. p.cur = 0
  142. p.curlen = 0
  143. return
  144. }
  145. p.error(p.f.Encode(p.last_seq.offset),
  146. "missing matching sequence delimiter '%c'",
  147. seq_delims[p.last_seq.rune])
  148. }
  149. p.error(p.f.Encode(p.offset),
  150. "unexpected read error: %s", err)
  151. }
  152. p.cur = r
  153. p.curlen = s
  154. if r == '\n' {
  155. p.f.AddLine(p.offset + p.curlen)
  156. }
  157. }
  158. func (p *parser) skip_spaces() {
  159. for {
  160. if is_space(p.cur) {
  161. p.next()
  162. } else {
  163. return
  164. }
  165. }
  166. panic("unreachable")
  167. }
  168. func (p *parser) skip_comment() {
  169. for {
  170. // there was an EOF, return
  171. if p.cur == 0 {
  172. return
  173. }
  174. // read until '\n'
  175. if p.cur != '\n' {
  176. p.next()
  177. } else {
  178. // skip '\n' and return
  179. p.next()
  180. return
  181. }
  182. }
  183. panic("unreachable")
  184. }
  185. func (p *parser) parse_node() *Node {
  186. again:
  187. // the convention is that this function is called on a non-space `p.cur`
  188. switch p.cur {
  189. case ')':
  190. return nil
  191. case '(':
  192. return p.parse_list()
  193. case '"':
  194. return p.parse_string()
  195. case '`':
  196. return p.parse_raw_string()
  197. case ';':
  198. // skip comment
  199. p.skip_comment()
  200. p.skip_spaces()
  201. goto again
  202. case 0:
  203. // delayed expected EOF
  204. panic(io.EOF)
  205. default:
  206. return p.parse_ident()
  207. }
  208. panic("unreachable")
  209. }
  210. func (p *parser) parse_list() *Node {
  211. loc := p.f.Encode(p.offset)
  212. save := p.advance_delim_state()
  213. head := &Node{Location: loc}
  214. p.next() // skip opening '('
  215. var lastchild *Node
  216. for {
  217. p.skip_spaces()
  218. if p.cur == ')' {
  219. // skip enclosing ')', but it could be EOF also
  220. p.restore_delim_state(save)
  221. p.next()
  222. return head
  223. }
  224. node := p.parse_node()
  225. if node == nil {
  226. continue
  227. }
  228. if head.Children == nil {
  229. head.Children = node
  230. } else {
  231. lastchild.Next = node
  232. }
  233. lastchild = node
  234. }
  235. panic("unreachable")
  236. }
  237. func (p *parser) parse_esc_seq() {
  238. loc := p.f.Encode(p.offset)
  239. p.next() // skip '\\'
  240. switch p.cur {
  241. case 'a':
  242. p.next()
  243. p.buf.WriteByte('\a')
  244. case 'b':
  245. p.next()
  246. p.buf.WriteByte('\b')
  247. case 'f':
  248. p.next()
  249. p.buf.WriteByte('\f')
  250. case 'n':
  251. p.next()
  252. p.buf.WriteByte('\n')
  253. case 'r':
  254. p.next()
  255. p.buf.WriteByte('\r')
  256. case 't':
  257. p.next()
  258. p.buf.WriteByte('\t')
  259. case 'v':
  260. p.next()
  261. p.buf.WriteByte('\v')
  262. case '\\':
  263. p.next()
  264. p.buf.WriteByte('\\')
  265. case '"':
  266. p.next()
  267. p.buf.WriteByte('"')
  268. default:
  269. switch p.cur {
  270. case 'x':
  271. p.next() // skip 'x'
  272. p.parse_hex_rune(2)
  273. case 'u':
  274. p.next() // skip 'u'
  275. p.parse_hex_rune(4)
  276. case 'U':
  277. p.next() // skip 'U'
  278. p.parse_hex_rune(8)
  279. default:
  280. p.error(loc, `unrecognized escape sequence within '"' string`)
  281. }
  282. }
  283. }
  284. func (p *parser) parse_hex_rune(n int) {
  285. if n > 8 {
  286. panic("hex rune is too large")
  287. }
  288. var hex [8]byte
  289. p.next_hex(hex[:n])
  290. r, err := strconv.ParseUint(string(hex[:n]), 16, n*4) // 4 bits per hex digit
  291. panic_if_error(err)
  292. if n == 2 {
  293. p.buf.WriteByte(byte(r))
  294. } else {
  295. p.buf.WriteRune(rune(r))
  296. }
  297. }
  298. func (p *parser) next_hex(s []byte) {
  299. for i, n := 0, len(s); i < n; i++ {
  300. if !is_hex(p.cur) {
  301. loc := p.f.Encode(p.offset)
  302. p.error(loc, `'%c' is not a hex digit`, p.cur)
  303. }
  304. s[i] = byte(p.cur)
  305. p.next()
  306. }
  307. }
  308. func (p *parser) parse_string() *Node {
  309. loc := p.f.Encode(p.offset)
  310. save := p.advance_delim_state()
  311. p.next() // skip opening '"'
  312. for {
  313. switch p.cur {
  314. case '\n':
  315. p.error(loc, `newline is not allowed within '"' strings`)
  316. case '\\':
  317. p.parse_esc_seq()
  318. case '"':
  319. node := &Node{
  320. Location: loc,
  321. Value: p.buf.String(),
  322. }
  323. p.buf.Reset()
  324. // consume enclosing '"', could be EOF
  325. p.restore_delim_state(save)
  326. p.next()
  327. return node
  328. default:
  329. p.buf.WriteRune(p.cur)
  330. p.next()
  331. }
  332. }
  333. panic("unreachable")
  334. }
  335. func (p *parser) parse_raw_string() *Node {
  336. loc := p.f.Encode(p.offset)
  337. save := p.advance_delim_state()
  338. p.next() // skip opening '`'
  339. for {
  340. if p.cur == '`' {
  341. node := &Node{
  342. Location: loc,
  343. Value: p.buf.String(),
  344. }
  345. p.buf.Reset()
  346. // consume enclosing '`', could be EOF
  347. p.restore_delim_state(save)
  348. p.next()
  349. return node
  350. } else {
  351. p.buf.WriteRune(p.cur)
  352. p.next()
  353. }
  354. }
  355. panic("unreachable")
  356. }
  357. func (p *parser) parse_ident() *Node {
  358. loc := p.f.Encode(p.offset)
  359. for {
  360. if is_delimiter(p.cur) {
  361. node := &Node{
  362. Location: loc,
  363. Value: p.buf.String(),
  364. }
  365. p.buf.Reset()
  366. return node
  367. } else {
  368. p.buf.WriteRune(p.cur)
  369. p.next()
  370. }
  371. }
  372. panic("unreachable")
  373. }
  374. func (p *parser) parse() (root *Node, err error) {
  375. defer func() {
  376. if e := recover(); e != nil {
  377. p.f.Finalize(p.offset)
  378. if e == io.EOF {
  379. return
  380. }
  381. if sexperr, ok := e.(*ParseError); ok {
  382. root = nil
  383. err = sexperr
  384. return
  385. }
  386. panic(e)
  387. }
  388. }()
  389. root = new(Node)
  390. p.next()
  391. // don't worry, will eventually panic with io.EOF :D
  392. var lastchild *Node
  393. for {
  394. p.skip_spaces()
  395. node := p.parse_node()
  396. if node == nil {
  397. p.error(p.f.Encode(p.offset),
  398. "unexpected ')' at the top level")
  399. }
  400. if root.Children == nil {
  401. root.Children = node
  402. } else {
  403. lastchild.Next = node
  404. }
  405. lastchild = node
  406. }
  407. panic("unreachable")
  408. }
  409. func (p *parser) parse_one_node() (node *Node, err error) {
  410. defer func() {
  411. if e := recover(); e != nil {
  412. p.f.Finalize(p.offset)
  413. if e == io.EOF {
  414. return
  415. }
  416. if sexperr, ok := e.(*ParseError); ok {
  417. node = nil
  418. err = sexperr
  419. return
  420. }
  421. panic(e)
  422. }
  423. }()
  424. p.next()
  425. p.skip_spaces()
  426. node = p.parse_node()
  427. if node == nil {
  428. p.error(p.f.Encode(p.offset),
  429. "unexpected ')' at the top level")
  430. }
  431. err = p.rs.UnreadRune()
  432. return
  433. }