View Javadoc

1   package oqube.muse;
2   
3   import java.util.List;
4   import java.util.ArrayList;
5   import java.util.regex.Pattern;
6   import java.util.regex.Matcher;
7   import java.util.Iterator;
8   import fr.lifl.parsing.ParserListenerDelegate;
9   import java.io.Reader;
10  import fr.lifl.parsing.ParserConfiguration;
11  import fr.lifl.parsing.ParserListener;
12  import java.io.BufferedReader;
13  import java.util.Stack;
14  import fr.lifl.parsing.ParserException;
15  import fr.lifl.parsing.Parser;
16  import fr.lifl.parsing.ParserPosition;
17  import fr.lifl.parsing.Namespace;
18  import java.io.IOException;
19  import java.io.InputStream;
20  import java.io.InputStreamReader;
21  import org.apache.commons.logging.Log;
22  import org.apache.commons.logging.LogFactory;
23  
24  /***
25   * Muse parser implementation. Current implementation is closed but future
26   * evolutions shall allow some customization of parsing process.
27   * 
28   * @author abailly@oqube.muse.com
29   * @version $Id$
30   */
31  public class MuseParser implements Parser {
32  
33    private static final String EOL = System.getProperty("line.separator");
34  
35    /* delegate handling of listeners */
36    private ParserListenerDelegate delegate = new ParserListenerDelegate();
37  
38    /* character stream to use */
39    private Reader reader;
40  
41    /* buffer storing pre-parsed data */
42    private StringBuffer tagContent;
43  
44    /*
45     * hold current tag name in tag mode. This allows nesting of tags inside tags
46     */
47    private String currentTag;
48  
49    public String getCurrentTag() {
50      return currentTag;
51    }
52  
53    public void setCurrentTag(String currentTag) {
54      this.currentTag = currentTag;
55    }
56  
57    public Reader getReader() {
58      return reader;
59    }
60  
61    /* debug mode */
62    private boolean debug = false;
63  
64    /* log instance */
65    private static Log log = LogFactory.getLog(MuseParser.class);
66  
67    public static Log getLog() {
68      return log;
69    }
70  
71    public static void setLog(Log log) {
72      MuseParser.log = log;
73    }
74  
75    /* current state - always equal to top of stack */
76    private State state;
77  
78    /* stacked states */
79    private Stack /* < State > */states = new Stack();
80  
81    /* current sink */
82    private MuseSink sink;
83  
84    public MuseSink getSink() {
85      return sink;
86    }
87  
88    public void setSink(MuseSink sink) {
89      this.sink = strong;
90      strong.setWrappedSink(sink);
91    }
92  
93    /*
94     * push state/col pair as new state
95     */
96    private State push(int st, int col) {
97      states.push(state);
98  
99      if (state != null) {
100       StringBuffer sb = new StringBuffer();
101       for (int i = 0; i < col; i++)
102         sb.append(' ');
103       sb.append(state.state).append(" > ").append(st);
104       if (log.isDebugEnabled())
105         log.debug(sb);
106     }
107     return state = new State(st, col);
108   }
109 
110   /*
111    * Return old state
112    */
113   private State pop() {
114     State s = state;
115     state = (State) states.pop();
116     StringBuffer sb = new StringBuffer();
117     for (int i = 0; i < state.col; i++)
118       sb.append(' ');
119     sb.append(state.state).append(" < ").append(s.state);
120     if (log.isDebugEnabled()) {
121       log.debug(sb);
122     }
123     return s;
124   }
125 
126   /* current position - for error messages and reporting */
127   private ParserPosition pos;
128 
129   public ParserPosition getPos() {
130     return pos;
131   }
132 
133   public void setPos(ParserPosition pos) {
134     this.pos = pos;
135   }
136 
137   /***
138    * Handles paragraph liens
139    */
140   private AbstractLexer paralex = new AbstractLexer("^(//S+.*)$") {
141 
142     /*
143      * Rules for paragraph: - start paragraph if not in paragraph. First end
144      * enclosing env - continue paragraph if in paragraph
145      * 
146      */
147     public void handler() {
148       switch (state.state) {
149       case para:
150         sink.text(matcher.group(1));
151         break;
152       case top:
153         /* start new paragraph */
154         sink.startPara();
155         sink.text(matcher.group(1));
156         push(para, 0);
157         break;
158       case tab:
159         pop();
160         handler();
161         break;
162       case item:
163         pop();
164         sink.endItem();
165         handler();
166         break;
167       case list:
168         pop();
169         sink.endList();
170         handler();
171         break;
172       case enums:
173         pop();
174         sink.endEnums();
175         handler();
176         break;
177       case quote:
178         pop();
179         sink.endQuote();
180         handler();
181         break;
182       case table:
183         pop();
184         sink.endTable();
185         handler();
186         break;
187       case center:
188         pop();
189         sink.endCenter();
190         handler();
191         break;
192       case head:
193         pop();
194         sink.endHeader();
195         sink.startBody();
196         handler();
197         break;
198       case tag:
199         tagContent.append(matcher.group(0)).append(EOL);
200         return;
201       default:
202         throw new ParserException("Invalid paragraph line " + matcher.group(1)
203             + " @(" + pos + ")");
204       }
205     }
206   };
207 
208   abstract class ItemLexer extends AbstractLexer {
209 
210     ItemLexer(String pat) {
211       super(pat);
212     }
213 
214     protected void handle(int type) {
215       if (state.state == tag) {
216         tagContent.append(matcher.group(0)).append(EOL);
217         return;
218       }
219       int c = matcher.group(1).length();
220       if (c > state.col) {
221         // new sub list
222         push(type, c);
223         push(item, c);
224         if (type == list)
225           sink.startList();
226         else if (type == enums)
227           sink.startEnums();
228         // ask for item content to be aligned with first line
229         c += matcher.group(2).length();
230         push(tab, c);
231         sink.startItem();
232         sink.text(matcher.group(3));
233       } else if (c < state.col) {
234         State s = pop(); // old state
235         switch (s.state) {
236         case para:
237           sink.endPara();
238           break;
239         case head:
240           sink.endHeader();
241           sink.startBody();
242           break;
243         case tab:
244           break;
245         case item:
246           sink.endItem();
247           break;
248         case list:
249           sink.endList();
250           break;
251         case enums:
252           sink.endEnums();
253           break;
254         case quote:
255           sink.endQuote();
256           break;
257         case center:
258           sink.endCenter();
259         case table:
260           sink.endTable();
261           break;
262         default:
263           throw new ParserException("invalid indentation of list item "
264               + matcher.group(0) + " @(" + pos + ")");
265         }
266         // recurse
267         handler();
268       } else {
269         assert c == state.col;
270         while (state.state != item) {
271           switch (state.state) {
272           case tab:
273             pop();
274             handler();
275             return;
276           default:
277             throw new ParserException("Invalid indentation of list item "
278                 + matcher.group(0) + " at " + pos
279                 + ": should be at least one space further right");
280           }
281         }
282         sink.endItem();
283         c += matcher.group(2).length();
284         push(tab, c);
285         sink.startItem();
286         sink.text(matcher.group(3));
287       }
288     }
289   };
290 
291   /* handle unordered lists items */
292   private AbstractLexer listlex = new ItemLexer("(//s+)(-//s+)(.*)") {
293     public void handler() {
294       handle(list);
295     }
296   };
297 
298   /* handle ordered lists items */
299   private AbstractLexer enumlex = new ItemLexer("(//s+)(//d+//.//s+)(.*)") {
300     public void handler() {
301       handle(enums);
302     }
303   };
304 
305   /* handle lines starting with blanks */
306   private AbstractLexer blanklex = new AbstractLexer("(//s+)(//S.*)") {
307     public void handler() {
308       // count blanks
309       int ws = matcher.group(1).length();
310       switch (state.state) {
311       case top:
312       case item:
313       case para:
314       case tab:
315         if (ws >= 6 + state.col) {// center
316           sink.startCenter();
317           push(center, ws);
318         } else if (ws > state.col) {
319           sink.startQuote();
320           push(quote, ws);
321         }
322         if (ws < state.col)
323           throw new ParserException("Incorrect indentation line "
324               + matcher.group(1) + "in state " + state + " @(" + pos + ")");
325         sink.text(matcher.group(2));
326         break;
327       case quote:
328       case center:
329         if (ws < state.col)
330           throw new ParserException("Incorrect indentation line "
331               + matcher.group(1) + "in state " + state + " @(" + pos + ")");
332         sink.text(matcher.group(2));
333         break;
334       case head:
335         pop();
336         sink.endHeader();
337         sink.startBody();
338         handler();
339         break;
340       case tag:
341         tagContent.append(matcher.group(0)).append(EOL);
342         return;
343       default:
344         throw new ParserException("Unexpected whitespace starting line "
345             + matcher.group(1) + "in state " + state + " @(" + pos + ")");
346       }
347     }
348   };
349 
350   /*
351    * handle empty lines Empty liens terminate current block(s) and return parser
352    * to toplevel state
353    */
354   private AbstractLexer emptylex = new AbstractLexer("^//s*$") {
355     public void handler() {
356       while (state.state != top) {
357         assert !states.isEmpty();
358         // update state
359         switch (state.state) {
360         case para:
361           sink.endPara();
362           break;
363         case list:
364           sink.endList();
365           break;
366         case quote:
367           sink.endQuote();
368           break;
369         case center:
370           sink.endCenter();
371           break;
372         case enums:
373           sink.endEnums();
374           break;
375         case item:
376           sink.endItem();
377           break;
378         case head:
379           sink.endHeader();
380           sink.startBody();
381           break;
382         case tab: // NOP
383           break;
384         case table:
385           sink.endTable();
386           break;
387         case tag:
388           tagContent.append(EOL).append(EOL);
389           return;
390         default:
391           throw new ParserException("Unexpected empty line in state " + state
392               + " @" + pos + "");
393         }
394         pop();
395       }
396       assert state.state == top;
397     }
398   };
399 
400   /* handle headers */
401   private AbstractLexer headerlex = new AbstractLexer("^(//*+)//s+(//S+.*)$") {
402 
403     public void handler() {
404       switch (state.state) {
405       case head:
406         pop();
407         sink.endHeader();
408         sink.startBody();
409         break;
410       case tag:
411         tagContent.append(matcher.group(0)).append(EOL);
412         return;
413       }
414       if (state.state != top)
415         throw new ParserException("Cannot use header inside blocks: "
416             + matcher.group(0) + " in state " + state.state + " @" + pos + "");
417       /* count levels */
418       int lvl = matcher.group(1).length();
419       switch (lvl) {
420       case 1:
421         sink.startTitle1();
422         sink.text(matcher.group(2));
423         sink.endTitle1();
424         break;
425       case 2:
426         sink.startTitle2();
427         sink.text(matcher.group(2));
428         sink.endTitle2();
429         break;
430       case 3:
431         sink.startTitle3();
432         sink.text(matcher.group(2));
433         sink.endTitle3();
434         break;
435       default: /* only four levels */
436         sink.startTitle4();
437         sink.text(matcher.group(2));
438         sink.endTitle4();
439         break;
440       }
441     }
442   };
443 
444   /* handle headers */
445   private AbstractLexer metalex = new AbstractLexer("^#(//S+)(:?//s+(.*))?$") {
446 
447     public void handler() {
448       switch (state.state) {
449       case head:
450         /* store key/value pair in sink */
451         sink.addMetadata(matcher.group(1), matcher.group(3));
452         break;
453       case tag:
454         tagContent.append(matcher.group(0)).append(EOL);
455         return;
456       default:
457         // group 2 should be empty and group 1 denotes an anchor
458         if (matcher.group(3) == null || "".equals(matcher.group(3)))
459           sink.anchor(matcher.group(1));
460         else
461           throw new ParserException("Cannot use meta data inside blocks: "
462               + matcher.group(0) + " @(" + pos + ")");
463       }
464     }
465   };
466 
467   /* handle separator - at least four dashes */
468   private AbstractLexer sepalex = new AbstractLexer("^----+") {
469 
470     public void handler() {
471       switch (state.state) {
472       case top:
473         sink.separator();
474         break;
475       case tag:
476         tagContent.append(matcher.group(0));
477         return;
478       default:
479         throw new ParserException("Cannot use separator inside blocks: "
480             + matcher.group(0) + " @" + pos + "");
481       }
482     }
483   };
484 
485   /* handle block tags - can use tags inside tags */
486   private AbstractLexer sttaglex = new AbstractLexer("^<(//w+)>//s*$") {
487 
488     public void handler() {
489       String tg = matcher.group(1);
490       // check nested tags
491       if (state.state == tag) {
492         tagContent.append(matcher.group(0)).append(EOL);
493         return;
494       }
495       setCurrentTag(tg);
496       push(tag, 0);
497       tagContent = new StringBuffer();
498     }
499   };
500 
501   private AbstractLexer endtaglex = new AbstractLexer("^</(//w+)>//s*$") {
502 
503     public void handler() {
504       String tg = matcher.group(1);
505       // check nested tags
506       if (!getCurrentTag().equals(tg)) {
507         tagContent.append(matcher.group(0)).append(EOL);
508         return;
509       }
510       if (state.state != tag)
511         throw new ParserException("Found ending tag " + tag + " @" + pos);
512       pop();
513       /* store key/value pair in sink */
514       sink.block(tg, tagContent.toString());
515     }
516   };
517 
518   private AbstractLexer tableHdrLex = new AbstractLexer("^[^|]+(//|//|[^|]+)+$") {
519 
520     public void handler() {
521       String th = matcher.group(0);
522       if (state.state == tag) {
523         tagContent.append(matcher.group(0)).append(EOL);
524         return;
525       }
526       if (state.state == table)
527         throw new ParserException("Cannot nest headers inside table");
528       push(table, state.col);
529       sink.startTable();
530       /* parse headers text */
531       sink.startTableRow();
532       String[] hdrs = th.split("//|//|");
533       for (int i = 0; i < hdrs.length; i++) {
534         sink.startTableHeader();
535         sink.text(hdrs[i]);
536         sink.endTableHeader();
537       }
538       sink.endTableRow();
539     }
540 
541   };
542 
543   private AbstractLexer tableDataLex = new AbstractLexer("^[^|]+(//|[^|]*)+$") {
544 
545     public void handler() {
546       String th = matcher.group(0);
547       if (state.state == tag) {
548         tagContent.append(matcher.group(0)).append(EOL);
549         return;
550       }
551       if (state.state != table) {
552         // need not have headers
553         push(table, state.col);
554         sink.startTable();
555       }
556       /* parse headers text */
557       sink.startTableRow();
558       String[] hdrs = th.split("//|");
559       for (int i = 0; i < hdrs.length; i++) {
560         sink.startTableData();
561         sink.text(hdrs[i]);
562         sink.endTableData();
563       }
564       sink.endTableRow();
565     }
566 
567   };
568 
569   /* states */
570   private static final int top = 0;
571 
572   private static final int para = 1;
573 
574   private static final int list = 2;
575 
576   private static final int item = 3;
577 
578   private static final int quote = 4;
579 
580   private static final int center = 5;
581 
582   private static final int enums = 6;
583 
584   private static final int tab = 7; // for aligning text inside lists
585 
586   private static final int head = 8; // in header
587 
588   private static final int foot = 9; // in footer
589 
590   private static final int tag = 10; // in tag
591 
592   private static final int table = 11; // in table
593 
594   // private static final int enums = blocks.add("(//s+)//d+//.//s+(.*)");
595 
596   /* patterns for formatting */
597   private FlowLexer emph = new FlowLexer("//*([^*]+)//*") {
598     public void handler() {
599       startEmph();
600       /* pass it over to next lexer */
601       assert next != null;
602       ((FlowLexer) next).format(matcher.group(1));
603       endEmph();
604     }
605   };
606 
607   private FlowLexer strong = new FlowLexer("//*//*([^*]+)//*//*") {
608     public void handler() {
609       startStrong();
610       /* pass it over to next lexer */
611       assert next != null;
612       ((FlowLexer) next).format(matcher.group(1));
613       endStrong();
614     }
615   };
616 
617   private FlowLexer verb = new FlowLexer("//=([^=]+)//=") {
618     public void handler() {
619       startVerb();
620       /* pass it over to next lexer */
621       assert next != null;
622       ((FlowLexer) next).format(matcher.group(1));
623       endVerb();
624     }
625   };
626 
627   private FlowLexer uline = new FlowLexer("_([^_]+)_") {
628     public void handler() {
629       startUline();
630       /* pass it over to next lexer */
631       assert next != null;
632       ((FlowLexer) next).format(matcher.group(1));
633       endUline();
634     }
635   };
636 
637   private FlowLexer link = new FlowLexer(
638       "//[//[([^]]*)//](?://[([^]]*)//])?//]") {
639     public void handler() {
640       link(matcher.group(1), matcher.group(2));
641     }
642   };
643 
644   /***
645    * Sets the configuration object to be used by this parser
646    * 
647    * @param config
648    */
649   public void setParserConfiguration(ParserConfiguration config) {
650   }
651 
652   /***
653    * Sets the reader object providing the stream of characters to parse. The
654    * reader is not closed by this method
655    * 
656    * @param reader
657    *          a valid Reader object
658    */
659   public void setReader(Reader reader) {
660     this.reader = reader;
661   }
662 
663   /***
664    * Set an input stream to be parsed by this parser. This input stream will be
665    * wrapped in a reader using default locale settings.
666    * 
667    * @param is
668    *          the stream. may not be null.
669    */
670   public void setStream(InputStream is) {
671     this.reader = new InputStreamReader(is);
672   }
673 
674   /***
675    * Adds a listener for parse events to this Parser. Parsing events are
676    * generated by the Parser to notify listeners of warnings and recoverable
677    * errors. Unrecoverable errors are notified through
678    * {@see fr.lifl.util.ParserException}.
679    * 
680    * @param listener
681    *          the listener to add to this parser
682    */
683   public void addParserListener(ParserListener listener) {
684     delegate.addParserListener(listener);
685   }
686 
687   /***
688    * Gives information to this parser that parsing starts at given position in
689    * the enclosing context. If this method is not called prior to a call to
690    * {@see #start()} method, start position is assumed to be line 1, column 1.
691    * 
692    * @param pos
693    *          the start position - may not be null
694    */
695   public void setStartPosition(ParserPosition pos) {
696     this.pos = pos;
697   }
698 
699   /***
700    * Gives the Parser information of the enclosing Namespace this parsing is
701    * part of.
702    * 
703    */
704   public void setStartScope(Namespace scope) {
705   }
706 
707   /***
708    * Default Constructor.
709    */
710   public MuseParser() {
711     // link block lexers
712     headerlex.setNext(tableHdrLex).setNext(tableDataLex).setNext(sttaglex)
713         .setNext(endtaglex).setNext(emptylex).setNext(sepalex).setNext(listlex)
714         .setNext(enumlex).setNext(blanklex).setNext(metalex).setNext(paralex);
715     // link flow lexers
716     strong.setNext(emph).setNext(verb).setNext(uline).setNext(link).setNext(
717         IdentityLexer.instance);
718   }
719 
720   /***
721    * Asks this Parser to start parsing. This method is normally blocking and
722    * Parser should return when finished. This method must be called after a call
723    * to {@see #setReader(java.io.Reader)} or else it will throw immediatly a
724    * ParserException.
725    * <p>
726    * Recoverable parse events are notified through registered ParserListener
727    * interface, while non recoverable errors throw a PArserException.
728    * 
729    * @throws ParserException
730    */
731   public void start() throws ParserException {
732     String line = null;
733     BufferedReader br = new BufferedReader(reader);
734     if (pos == null)
735       pos = new ParserPosition(1, 1);
736     else {
737       pos.setLine(1);
738       pos.setColumn(1);
739     }
740     push(top, 0);
741     push(head, 0);
742     sink.startHeader();
743     /*
744      * parse lines to create blocks then parse blocks to add formatting and
745      * resolve link
746      */
747     try {
748       while ((line = br.readLine()) != null) {
749         parse(line);
750         pos.setLine(pos.getLine() + 1);
751       }
752       // call empty lex at end
753       if (state.state != top)
754         emptylex.handler();
755       assert state.state == top;
756       sink.endBody();
757     } catch (IOException e) {
758       throw new ParserException(e);
759     }
760   }
761 
762   /***
763    * @param line
764    */
765   public void parse(String line) {
766     if (log.isDebugEnabled())
767       log.debug("Parsing line " + line + " @" + pos);
768     headerlex.parse(line);
769     if (log.isDebugEnabled())
770       log.debug("Done");
771   }
772 
773 }