|
|
HTML Parser Example
// =================================================================
// ProGrammar Grammar Definition File
// -----------------------------------------------------------------
//
// Markup.GDL - Defines a general syntax for parsing documents
// that are marked-up by tags. This grammar accepts
// any tag name, as long as the beginning and ending
// tags match. Specific markup languages, such as
// HTML, can extend and constrain this grammar.
//
// This example is intended for demonstration purposes only.
//
// (c) Copyright 1999, 2000 NorKen Technologies, Inc.
// All rights reserved.
//
// ==================================================================
grammar Markup
{
    // Start symbol: a document is a sequence of sections.
    text ::= { section };

    // A section is one of the alternatives below. The alternatives are
    // listed with "script" ahead of the generic "start_tag", and
    // "comment" ahead of "doctype" (both of the latter begin with "<!").
    section ::=
          script
        | start_tag [{ section }] [end_tag]
        | comment
        | doctype
        | plain_text
        ;

    plain_text ::= *("<");      // parse everything up to the next tag

    // A comment runs from "<!--" to the first "-->".
    // NOTE(review): the opening literal and scan were missing from this
    // copy of the grammar (the line read `"") "-->"`, which has an
    // unbalanced parenthesis); restored following the *(...) scan idiom
    // used by the other productions.
    comment ::= "<!--" *("-->") "-->";

    // A document type declaration; its content is captured by
    // doctype_info.
    // NOTE(review): the body was missing from this copy (it read `""`),
    // leaving doctype_info unreferenced; reconstructed. Confirm the
    // exact keyword literal against the original Markup.GDL.
    doctype ::= "<!" "DOCTYPE" doctype_info ">" ;

    // A script element: everything between the start and end script
    // tags is scanned without being parsed as markup.
    script ::= start_script
               *(end_script) end_script ;

    start_script ::= "<" "script" *(">") ">" ;
    end_script   ::= "<" "/" "script" ">" ;

    doctype_info ::= *(">");    // everything up to the closing '>'

    start_tag ::= "<"
                  element_name
                  [{ tag_attr }]    // optional tag attributes
                  ">"
                ;

    element_name ::= identifier;

    tag_attr  ::= attr_name ["=" attr_value_string];
    attr_name ::= '[a-zA-Z0-9-]+' ;

    // There are three different ways to delimit an attribute
    // value: with double-quotes, single quotes, or not delimited.
    attr_value_string ::=
          "\"" attr_value "\""
        | "\'" attr_value "\'"
        | attr_value
        ;

    // Regardless of how the attribute value is delimited, we
    // are only interested in the "value itself". The following
    // production shows how a nonterminal symbol can be parsed
    // in different ways, depending upon the context in which it
    // occurs. For example, when attr_value occurs in
    // double-quotes, the parser will scan all characters up to
    // the closing quote; but when it occurs without quotations,
    // the parser scans all characters up to the next whitespace
    // or closing tag ('>') character. Hence, symbol attr_value
    // is "polymorphic".
    attr_value ::=
          (? #VALUE == "\"") *("\"")    // scan to closing dbl-quote
        | (? #VALUE == "\'") *("\'")    // scan to closing single-quote
        | '[^\32\t\n\r>]+'              // scan to next whitespace or '>'
        ;

    // An end tag: "</" name ">".
    // NOTE(review): the "</" literal was missing from this copy (it
    // read `""`); restored.
    end_tag ::= "</"
                end_tag_name (? #VALUE == ^^section.start_tag.element_name)
                ">";

    // The constraint "(? #VALUE == ^^section.start_tag.element_name)"
    // enforces the rule that end_tags must have the same name as their
    // corresponding start_tag.
    end_tag_name ::= alphanumeric;
};
// =================================================================
// ProGrammar Grammar Definition File
// -----------------------------------------------------------------
//
// Html.GDL - Parses HTML documents; extends the "Markup" grammar.
//
// This example is intended for demonstration purposes only.
//
// (c) Copyright 1999, 2000 NorKen Technologies, Inc.
// All rights reserved.
//
// ==================================================================
grammar HTML extends Markup // extend the 'Markup' grammar
{
// Start symbol: reuse the start production of the base grammar.
text ::=
    Markup.text
    ;
// Restrict "element_name" to the tag names listed in html32_tag
// (HTML 3.2). The name is parsed by the base grammar's production
// first; the constraint then checks its value and carries warning
// code 100.
element_name ::=
    Markup.element_name
    (?
        #WARN = 100,
        #VALUE ::= html32_tag;
    )
    ;
// Define the valid HTML tag names: either one of the fixed names
// below, or a heading tag written as "H" followed by numeric<1>
// (H1, H2, ...).
//
// The heading form must be an ALTERNATIVE to the name list. Without
// the "|" it is sequenced after the list, so a plain name such as
// "P" or "TABLE" could never satisfy this production.
html32_tag ::=
      ( "A"         | "ADDRESS"    | "AREA"     | "B"
      | "BASE"      | "BASEFONT"   | "BGSOUND"  | "BIG"
      | "BLINK"     | "BLOCKQUOTE" | "BODY"     | "BR"
      | "CAPTION"   | "CENTER"     | "CITE"     | "CODE"
      | "COL"
      | "COLGROUP"  | "COMMENT"    | "DFN"      | "DIR"
      | "DIV"       | "DL"         | "DT"       | "EM"
      | "FONT"
      | "FORM"      | "FRAME"      | "FRAMESET" | "HEAD"
      | "HR"        | "HTML"       | "I"        | "IMG"
      | "INPUT"     | "ISINDEX"    | "KBD"      | "LI"
      | "LINK"      | "LISTING"    | "MAP"      | "MARQUEE"
      | "MENU"      | "META"       | "NEXTID"   | "NOBR"
      | "NOFRAMES"  | "OL"         | "OPTION"   | "P"
      | "PLAINTEXT" | "PRE"        | "SAMP"     | "SCRIPT"
      | "SELECT"    | "SMALL"      | "STRIKE"   | "STRONG"
      | "SUB"       | "SUP"        | "TABLE"    | "TBODY"
      | "TD"        | "TEXTAREA"   | "TFOOT"    | "TH"
      | "THEAD"     | "TITLE"      | "TR"       | "TT"
      | "U"         | "UL"         | "VAR"      | "WBR"
      | "XMP"
      )
    | ( "H" numeric<1> )
    ;
grammar Attrs
{
    // ---------------------------------------------------------------
    // Attribute-name lists, one production per element, each named
    // "<element>_attrs".
    // ---------------------------------------------------------------

    a_attrs ::=
          "HREF" | "METHODS" | "NAME"  | "REL"
        | "REV"  | "TARGET"  | "TITLE" | "URN"
        ;

    area_attrs ::= "SHAPE" | "COORDS" | "HREF" ;

    basefont_attrs ::= "FACE" | "COLOR" ;

    bgsound_attrs ::= "SRC" | "LOOP" ;

    body_attrs ::=
          "BACKGROUND" | "BGCOLOR" | "BGPROPERTIES"
        | "LEFTMARGIN" | "LINK"    | "VLINK" | "ALINK"
        | "TEXT"       | "TOPMARGIN" | "STYLESRC"
        | "MARGINWIDTH" | "MARGINHEIGHT"
        ;

    base_attrs ::= "HREF" | "TARGET" ;

    caption_attrs ::= "ALIGN" | "VALIGN" ;

    col_attrs ::= "ALIGN" | "SPAN" ;

    colgroup_attrs ::= "ALIGN" | "SPAN" | "VALIGN" ;

    font_attrs ::= "COLOR" | "FACE" | "SIZE" ;

    form_attrs ::= "ACTION" | "ENCTYPE" | "METHOD" ;

    frame_attrs ::=
          "SRC" | "NAME" | "MARGINWIDTH" | "MARGINHEIGHT"
        | "SCROLLING" | "NORESIZE" | "FRAMEBORDER"
        | "FRAMESPACING"
        ;

    // A frameset accepts its own two attributes plus every frame
    // attribute.
    frameset_attrs ::= "ROWS" | "COLS" | frame_attrs ;

    hr_attrs ::= "ALIGN" | "COLOR" | "NOSHADE" | "SIZE" | "WIDTH" ;

    input_attrs ::=
          "ALIGN"  | "CHECKED" | "MAXLENGTH" | "NAME" | "SIZE"
        | "SRC"    | "TYPE"    | "BUTTON"    | "CHECKBOX" | "FILE"
        | "HIDDEN" | "IMAGE"   | "PASSWORD"  | "RADIO"
        | "RESET"  | "SUBMIT"  | "TEXT"      | "TEXTAREA" | "VALUE"
        ;

    img_attrs ::=
          "ALIGN" | "ALT"    | "BORDER" | "ISMAP" | "LOWSRC"
        | "SRC"   | "VSPACE" | "HSPACE" | "WIDTH"
        | "HEIGHT" | "USEMAP"
        // MS Explorer
        | "CONTROLS" | "DYNSRC" | "LOOP" | "LOOPDELAY"
        | "START"    | "VRML"   | "NAME"
        ;

    isindex_attrs ::= "ACTION" | "PROMPT" ;

    // LINK takes the same attribute list as A.
    link_attrs ::= a_attrs ;

    map_attrs ::= "NAME" ;

    marquee_attrs ::=
          "ALIGN"  | "BEHAVIOR" | "BGCOLOR" | "DIRECTION"
        | "HEIGHT" | "HSPACE"   | "LOOP"    | "SCROLLAMOUNT"
        | "SCROLLDELAY" | "VSPACE" | "WIDTH"
        ;

    meta_attrs ::= "CONTENT" | "HTTP-EQUIV" | "NAME" ;

    option_attrs ::= "SELECTED" | "VALUE" ;

    p_attrs ::= "ALIGN" ;

    pre_attrs ::= "WIDTH" ;

    select_attrs ::= "MULTIPLE" | "NAME" | "SIZE" ;

    table_attrs ::=
          "ALIGN" | "BACKGROUND" | "BGCOLOR" | "BORDER"
        | "BORDERCOLOR" | "BORDERCOLORDARK"
        | "BORDERCOLORLIGHT" | "CELLPADDING" | "CELLSPACING"
        | "FRAME" | "HEIGHT" | "RULES" | "VALIGN" | "WIDTH"
        ;

    td_attrs ::=
          "ALIGN" | "BACKGROUND" | "BGCOLOR" | "BORDERCOLOR"
        | "BORDERCOLORDARK" | "BORDERCOLORLIGHT" | "COLSPAN"
        | "HEIGHT" | "NOWRAP" | "ROWSPAN" | "VALIGN"
        | "WIDTH"
        ;

    textarea_attrs ::= "WRAP" ;

    // TH takes the same attribute list as TD.
    th_attrs ::= td_attrs ;

    tr_attrs ::=
          "ALIGN" | "BGCOLOR" | "BORDERCOLOR"
        | "BORDERCOLORDARK" | "BORDERCOLORLIGHT" | "VALIGN"
        ;

    // ---------------------------------------------------------------
    // Constraints on attribute values.
    // Name a production "<attribute>_value" and the attr_value
    // constraint of the enclosing grammar will invoke it automatically.
    // ---------------------------------------------------------------

    align_value ::=
          "CENTER" | "JUSTIFY" | "LEFT"
        | "RIGHT"  | "TOP"     | "BOTTOM"
        ;

    height_value ::= value_or_percent ;

    size_value ::= ["-"|"+"] numeric ;

    valign_value ::= "TOP" | "BOTTOM" ;

    // A number, optionally followed by a percent sign.
    value_or_percent ::= numeric ["%"] ;

    width_value ::= value_or_percent ;
};
// Constrain attribute names. The name is first parsed by the base
// grammar's attr_name production; the constraint below then checks its
// value and carries warning code 101.
//
// The symbol name is built as "Attrs." + <element_name of the
// start_tag> + "_attrs", which matches the naming of the per-element
// lists in grammar Attrs (e.g. a_attrs, img_attrs).
// NOTE(review): presumably the first alternative accepts any name when
// no such production exists (negated #ISSYMBOL followed by an
// unrestricted scan), and the second requires the name to match that
// production otherwise -- confirm the semantics of ^#ISSYMBOL, #SYMBOL
// and ^* against the ProGrammar documentation.
attr_name ::= Markup.attr_name
(? #WARN = 101,
#VALUE ::=
^#ISSYMBOL("Attrs." +
^*start_tag.element_name + "_attrs") *()
| #SYMBOL("Attrs." +
^*start_tag.element_name + "_attrs");
);
// Constrain attribute values. The value is first parsed by the base
// grammar's attr_value production; the constraint below then checks it
// and carries warning code 102.
//
// The symbol name is built as "Attrs." + <attr_name> + "_value", which
// matches the naming of the value productions in grammar Attrs
// (e.g. align_value, width_value).
// NOTE(review): presumably values of attributes that have no
// "_value" production are accepted unchecked by the first alternative,
// while the second applies the production when it exists -- confirm the
// semantics of ^#ISSYMBOL, #SYMBOL and ^. against the ProGrammar
// documentation.
attr_value ::= Markup.attr_value
(? #WARN = 102,
#VALUE ::=
^#ISSYMBOL("Attrs." + ^.attr_name + "_value") *()
| #SYMBOL("Attrs." + ^.attr_name + "_value") ;
);
};
|