#!/usr/bin/python
""" slurpIcalSpec.py -- extract formal view of iCalendar/vCard specs

produces XHTML with various typed links

@@TODO: make each marked up item visible via css
(unless they're all linked).

see also: rfc2html in dev.w3.org @@


"""

__version__ = '$Id: slurpIcalSpec.py,v 1.27 2005/11/09 23:10:49 connolly Exp $'
#see also: changelog at end

import sys
import re


class Usage(Exception):
    """USAGE: python slurpIcalSpec.py NNNN <rfcNNNN.txt >rfcNNNN.html
    where NNNN is one of the supported RFCS
    (so far: 2425, 2426, and 2445)
    """
    # Raised by main() when the command line does not name a supported RFC.
    # The docstring above is also the user-facing usage message: the
    # __main__ block prints the exception's __doc__ to stderr, so editing
    # the docstring changes program output.

# Per-RFC quirks, keyed by RFC number.  Each value is a 3-tuple
# (footerStarters, typos, example_tags):
#   footerStarters -- line prefixes that depaginate() treats as a page
#                     break (the running header and footer of each page)
#   typos          -- (text to find, replacement) pairs that depaginate()
#                     applies to every input line
#   example_tags   -- structured-section tags for which htmlTop() emits
#                     a CSS rule
Specs = {
    2445: (
        ("RFC", "Dawson"),
        (
            ('"EVENT"', '"VEVENT"'),  # 4.8.7.3 Last Modified
            ('   Purpose This value type',
             '   Purpose: This value type'),
            ('11.  Full Copyright Statement',
             '11  Full Copyright Statement'),
            ('3.11 Contact for Further Information:',
             '3.11 Contact for Further Information'),
        ),
        ('Example',),
    ),
    2426: (
        ("RFC", "Dawson"),
        (
            # a ref is split across lines; put it on one line
            ('Hence, this [MIME-', 'Hence, this [MIME-DIR]'),
            ('DIR] profile is', ' profile is'),
        ),
        ('Type example',),
    ),
    2425: (
        ("RFC", "Howes"),
        (),
        (),
    ),
}

def main(argv):
    """Convert the RFC text on stdin to XHTML on stdout.

    argv -- the command line: [program name, RFC number].  The number
            selects the per-RFC settings in Specs.

    Raises Usage when the argument count is wrong, when the RFC number
    is not an integer or has no entry in Specs, or when stdin contains
    no text at all.  Errors from the helpers (e.g. ValueError from
    titleEtc or findRefs) propagate unchanged.
    """
    if len(argv) != 2:
        raise Usage()
    try:
        pgbrks, typos, exampleTags = Specs[int(argv[1])]
    except (ValueError, KeyError):
        raise Usage()

    w = sys.stdout.write

    sections = list(bySection(depaginate(sys.stdin, pgbrks, typos)))
    if not sections:
        # Empty input used to surface as an uncaught StopIteration from
        # the generator's .next(); report it as a usage problem instead.
        raise Usage()

    # The first section is the RFC front matter (header block and title).
    head, sections = sections[0], sections[1:]
    title, rfcnum, category, date, authlines = titleEtc(head)
    w(htmlTop(title, rfcnum, category, date, authlines, exampleTags) + "\n")

    # First pass: collect the reference labels, so that the second pass
    # can link [LABEL] occurrences in the body to the bibliography.
    refs = findRefs(sections)

    for sec in sections:
        heading = sec[0]
        if heading.find("Table of Contents") >= 0:
            tocSect(w, sec)
        elif heading[0].isdigit():
            numSect(w, sec, refs)
        else:
            flowSect(w, sec, refs)

    w(htmlBot() + "\n")


def depaginate(fp, footerStarters, typos):
    """Generate the lines of an RFC with its pagination removed.

    fp             -- object with a readline() method, read until it
                      returns an empty string
    footerStarters -- prefixes; a line starting with any of them is
                      handled exactly like a form feed line
    typos          -- (text to find, replacement) pairs applied to every
                      line before anything else looks at it

    Line terminators are stripped.  Blank lines directly after a page
    break are dropped, blank lines elsewhere are held back until a
    non-blank line follows, and each page break leaves a single blank
    line behind.

    Background on the page layout:
    'Each page must be limited to 58 lines followed by a form feed on a
     line by itself.'
      --Instructions to RFC Authors
        Postel & Reynolds Oct 1997
        http://www.ietf.org/rfc/rfc2223
    """

    held = []           # blank lines not yet known to be followed by text
    atPageTop = True

    for raw in iter(fp.readline, ''):
        text = raw.rstrip("\r\n")

        for wrong, right in typos:
            text = text.replace(wrong, right)

        pageBreak = (text == "\f")
        if not pageBreak:
            for prefix in footerStarters:
                if text.startswith(prefix):
                    pageBreak = True
                    break

        if pageBreak:
            held = ['']  # leave a blank line where the pagebreak was
            atPageTop = True
        elif text:
            for blank in held:
                yield blank
            held = []
            yield text
            atPageTop = False
        elif not atPageTop:
            held.append(text)


def bySection(lines):
    """Split a sequence of lines into sections; yield each as a list.

    A line opens a new section when it is non-empty, does not start
    with a space, "Request for Comments" or "Category:", and contains
    none of ":", "<" or "--".  Every other line (blank lines included)
    is appended to the section in progress.
    """
    bodyPrefixes = (" ", "Request for Comments", "Category:")
    bodyMarkers = (":", "<", "--")

    def opensSection(text):
        # true only for lines that look like a section heading
        if not text:
            return False
        for prefix in bodyPrefixes:
            if text.startswith(prefix):
                return False
        for marker in bodyMarkers:
            if marker in text:
                return False
        return True

    current = []
    for text in lines:
        if opensSection(text):
            if current:
                yield current
            current = [text]
        else:
            current.append(text)
    if current:
        yield current


def titleEtc(lines):
    """Pick the title and front-matter fields out of the first section.

    lines -- the lines of the RFC's first section: a header block with
             the series information in the first 40 columns and the
             authors to the right of it, a blank line, then the title.

    Returns (title, rfcnum, category, date, authlines), where:
      title     -- the lines after the first blank line, joined by spaces
      rfcnum    -- last word in the first 40 columns of the second line
      category  -- text after the ':' in the first 40 columns of the
                   third line
      date      -- the last header line before the blank line
      authlines -- columns 40 and up of every header line before the date

    Raises ValueError when no blank line is found or when the header
    block is too short to hold the RFC number and category lines.
    """
    sep = None
    for idx, text in enumerate(lines):
        if text.strip() == '':
            sep = idx
            break
    if sep is None:
        raise ValueError("no blank line separating header from title: "
                         + repr(lines))
    if sep < 3:
        # lines[1] and lines[2] below must both be header lines; without
        # this check a short header gave an IndexError or garbage fields
        raise ValueError("RFC header too short: " + repr(lines[:sep]))

    title = ' '.join([text.strip() for text in lines[sep+1:]]).strip()
    rfcnum = lines[1][:40].strip().split()[-1]
    category = lines[2][:40].strip().split(':')[1].strip()
    date = lines[sep-1].strip()
    authlines = [text[40:].strip() for text in lines[:sep-1]]

    return title, rfcnum, category, date, authlines


def findRefs(sections):
    """Return the reference labels listed in the References section.

    sections -- sequence of sections, each a list of lines whose first
                line is the heading.

    The first section whose heading ends with "References" is scanned.
    Each blank-line-separated paragraph contributes the text between the
    first '[' and the following ']' on its first line.  Paragraphs whose
    first line carries no such label are skipped rather than aborting
    the run with an unpacking error.

    Raises ValueError when no section heading ends with "References".

    >>> findRefs([['9 References', '', '   [IMIP] Dawson, F.,', '   RFC 2447.', '']])
    ['IMIP']
    """
    for sec in sections:
        if not sec[0].strip().endswith("References"):
            continue

        refs = []
        atParaStart = True
        for ln in sec[1:]:
            ln = ln.strip()
            if ln == '':
                atParaStart = True
                continue
            if not atParaStart:
                continue        # continuation line of the current entry
            atParaStart = False

            start = ln.find('[')
            end = ln.find(']', start + 1)
            if start >= 0 and end >= 0:
                refs.append(ln[start + 1:end])
        return refs
    raise ValueError("no References section found")


def htmlTop(title, rfcnum, category, date, authlines, exampleTags):
    """Return the opening of the XHTML document, up to the provenance note.

    title       -- document title, used for <title> and <h1>
    rfcnum      -- RFC number, shown in the header table and used to
                   build the link to the source text
    category    -- RFC category shown in the header table
    date        -- publication date shown in the header table
    authlines   -- author/affiliation lines, joined with <br />
    exampleTags -- structured-section tags; each one gets a CSS rule for
                   dd elements of the class given by asClass()

    title, category, date and authlines are plain text and are escaped
    for use as XHTML element content.
    """
    def esc(txt):
        # same escaping as doChars(); "%s" keeps accepting any value the
        # old straight interpolation accepted
        return ("%s" % (txt,)).replace("&", "&amp;").replace("<", "&lt;")

    # one rule per example tag; the original rule text lacked its closing
    # brace, leaving the style sheet malformed
    rules = []
    for t in exampleTags:
        rules.append("dd.%s {   border-style: solid;  border-color: #d0dbe7;  border-width: 0 0 0 .25em;  padding-left: 0.5em; }\n" % asClass(t))
    css = "".join(rules)

    authors = '<br />'.join([esc(a) for a in authlines])

    html = """
<html xmlns='http://www.w3.org/1999/xhtml'>
<head profile='http://www.w3.org/2003/g/data-view'>
  <title>%s</title>
  <link rel="transformation"
    href="http://www.w3.org/2002/12/cal/webize2445.xsl"/>
  <style type="text/css">
  %s
  </style>
</head>
<body>
<table>
<tr><td>Network Working Group<br />
Request for Comments: %s<br />
Category: %s<br />
</td>
<td>%s</td>
</tr>
<tr><td>&#160;</td><td>%s</td></tr>
</table>
<h1>%s</h1>
<address>
$Revision: 1.27 $ of $Date: 2005/11/09 23:10:49 $
derived from <a href="http://www.ietf.org/rfc/rfc%s.txt">rfc%s.txt</a>
and enhanced
for <a href="http://www.w3.org/2004/01/rdxh/spec">gleaning formal description</a>
using <a href="slurpIcalSpec.py">slurpIcalSpec.py</a>
by <a href="http://www.w3.org/People/Connolly/">Dan Connolly</a>
</address>

""" % (esc(title), css, rfcnum, esc(category), authors, esc(date),
       esc(title), rfcnum, rfcnum)

    return html

def htmlBot():
    """Return the markup that closes the document opened by htmlTop()."""
    closing = "</body></html>"
    return closing

def flowSect(w, lines, refs):
    """Write an unnumbered section as a heading plus flowed paragraphs.

    w     -- callable that takes the next piece of output text
    lines -- the section's lines; lines[0] is the heading
    refs  -- reference labels, passed through for linking
    """
    heading = lines[0]
    w("<div><h2>%s</h2>\n" % (heading,))
    # the paragraphs, and the closing </div>, come from flowSectRest
    flowSectRest(w, lines, refs)

def flowSectRest(w, lines, refs):
    """Write the body of a section as <p> paragraphs and close its <div>.

    lines[0] (the heading) is skipped.  Runs of non-blank lines become
    one paragraph each, with every line passed through bodyText(); blank
    lines end the paragraph in progress.
    """
    inPara = False
    for text in lines[1:]:
        if not text.strip():
            # blank line: close the paragraph, if one is open
            if inPara:
                w("</p>\n")
                inPara = False
            continue
        if not inPara:
            w("<p>\n")
            inPara = True
        bodyText(w, text, refs)
        w("\n")
    if inPara:
        w("</p>\n")
    w("</div>")


def tocSect(w, lines):
    """Write the table of contents as a list of links to the sections.

    lines[0] is the heading.  Each following non-blank line becomes one
    list item: a leading word that starts with a digit is taken as the
    section number, and the entry text is cut at its first "." (which
    drops the dot leaders and page number).  The link target is "#sec"
    followed by the section number.
    """
    w("<div><h2>%s</h2>\n" % (lines[0],))
    w("<ul type='none'>\n")
    for raw in lines[1:]:
        entry = raw.strip()
        if not entry:
            continue

        num = ''
        if entry[0].isdigit():
            num, entry = entry.split(None, 1)

        cut = entry.find(".")
        if cut >= 0:
            head = entry[:cut]
        else:
            head = entry
        w("<li><a href='#sec%s'>%s %s</a></li>\n" % (num, num, head))
    w("</ul></div>")


def refSect(w, lines):
    """Write the bibliography as a <dl> and close the section's <div>.

    w     -- callable that takes the next piece of output text
    lines -- the section's lines; lines[0] (the heading) is skipped

    Entries are paragraphs separated by blank lines.  The first line of
    an entry carries its label in square brackets; the text after the
    label plus the remaining lines of the paragraph form the entry body,
    which refEntry() formats.

    Each entry is written exactly once.  (Previously the pending entry
    was never cleared after being written, so the last entry of a
    section ending in a blank line came out twice.)  A paragraph whose
    first line has no [label] is taken as a continuation of the previous
    entry, or ignored when there is no previous entry, instead of
    aborting with an unpacking error.
    """
    ref = None
    dd = None           # body of the entry collected so far, if any
    atParaStart = True

    w("<dl>\n")
    for ln in lines[1:]:
        ln = ln.strip()
        if ln == '':
            atParaStart = True
            continue

        if atParaStart:
            atParaStart = False
            start = ln.find('[')
            end = ln.find(']', start + 1)
            if start >= 0 and end >= 0:
                # a new labelled entry begins: flush the previous one
                if dd:
                    refEntry(w, ref, dd)
                ref = ln[start + 1:end]
                dd = ln[end + 1:] + "\n"
                continue

        if dd is not None:
            dd = dd + ln + "\n"

    if dd:
        refEntry(w, ref, dd)

    w("</dl>\n")
    w("</div>\n")


def refEntry(w, ref, dd):
    """write a reference entry

    w   -- callable that takes the next piece of output text
    ref -- the reference label, without its square brackets
    dd  -- the text of the entry

    When dd contains exactly one double-quoted span, that span is
    marked up as the cited title.  The title becomes a link when a
    target can be found: the RFC named by an "RFC nnnn" label, else the
    first "RFC nnn"/"RFC nnnn" mentioned in dd, else the first http or
    ftp URL in dd.  Otherwise dd is written as plain escaped text.

    >>> import StringIO
    >>> w = StringIO.StringIO()
    >>> refEntry(w.write, 'IMIP', 'Dawson, F., Mansour, S. and S. Silverberg, "iCalendar Message-based Interoperability Protocol (IMIP)", RFC 2447, November 1998.'); w.getvalue()
    "<dt id='ref_IMIP'>[IMIP]</dt>\n<dd>Dawson, F., Mansour, S. and S. Silverberg, <cite><a href='http://www.ietf.org/rfc/rfc2447'>iCalendar Message-based Interoperability Protocol (IMIP)</a></cite>, RFC 2447, November 1998.</dd>\n"

    """
    w("<dt id='%s'>[%s]</dt>\n" % (asID(ref), ref))
    w("<dd>")

    # try to mark up the title
    parts = dd.split('"')
    if len(parts) != 3:
        # no single quoted title to pick out
        doChars(w, dd)
        w("</dd>\n")
        return

    before, title, after = parts
    doChars(w, before)

    w("<cite>")

    # try to make it a link
    href = None
    if ref.startswith("RFC "):
        href = rfcAddr(ref.split(' ')[1])
    else:
        m = re.search(r'RFC (\d\d\d\d?)', dd)
        if m:
            href = rfcAddr(m.group(1))
        else:
            m = re.search(r'((http|ftp)://[^ ,]+)', dd)
            if m:
                href = m.group(1)

    if href:
        # the address comes from the input text and goes into a
        # single-quoted attribute, so it needs escaping (URLs with a
        # query string contain "&")
        href = href.replace("&", "&amp;").replace("<", "&lt;")
        href = href.replace("'", "&#39;")
        w("<a href='%s'>" % href)
        doChars(w, title)
        w("</a>")
    else:
        doChars(w, title)
    w("</cite>")
    doChars(w, after)

    w("</dd>\n")
    

def rfcAddr(num):
    """Return the ietf.org address of the RFC with the given number."""
    template = 'http://www.ietf.org/rfc/rfc%s'
    return template % num

def asID(ref):
    """make the element ID used for a reference label

    The ID is the label prefixed with 'ref_', with every space changed
    to an underscore.

    >>> asID('VCARD')
    'ref_VCARD'
    >>> asID('RFC 1872')
    'ref_RFC_1872'
    """

    words = ref.split(' ')
    return '_'.join(['ref'] + words)


def numSect(w, lines, refs):
    """Write a numbered section, choosing the layout from its content.

    w     -- callable that takes the next piece of output text
    lines -- the section's lines; lines[0] is "<number> <heading>"
    refs  -- reference labels, passed through for linking

    The heading gets the id "sec<number>".  The body is then written by
      - refSect() for the "References" section,
      - flowSectRest() for the copyright, acknowledgement and abstract
        sections,
      - doStructuredSection() when the third line of the section opens
        one of the known registration templates,
      - otherwise verbatim inside <pre>.

    Sections with fewer than three lines, and headings that consist of
    a number only, fall through to the <pre> layout instead of raising
    IndexError / ValueError.
    """
    parts = lines[0].split(None, 1)
    num = parts[0]
    head = ''
    if len(parts) > 1:
        head = parts[1]

    w("<div><h2 id='sec%s'>%s %s</h2>\n" % (num, num, head))

    #print >>sys.stderr, "numSect:", num, head

    # (line prefixes, section type, RDF class) per registration template
    structured = (
        (("   Property Name:",), "Property", 'Property'),
        (("   Value Name:",
          "     Value Name:"), "Value", None),  # 4.8.3 indented oddly
        (("   Component Name:",), "Component", 'Class'),
        (("   Parameter Name:",), "Parameter", 'Property'),
        (("   To: ietf-mime-directory@imc.org",), "Type", None),
    )
    flowed = ('Full Copyright Statement', 'Acknowledgements',
              'Acknowledgments', 'Abstract')

    if head == "References":
        refSect(w, lines)
        return
    if head in flowed:
        flowSectRest(w, lines, refs)
        return

    # the line that would open a registration template, if there is one
    lead = ''
    if len(lines) > 2:
        lead = lines[2]

    for prefixes, secType, rdfClass in structured:
        for prefix in prefixes:
            if lead.startswith(prefix):
                doStructuredSection(w, lines, refs, secType, rdfClass)
                return

    w("<pre>")
    for l in lines[1:]:
        bodyText(w, l, refs)
        w("\n")
    w("</pre>\n</div>\n")


def bodyText(w, txt, refs):
    """ write body text, linking refs

    w    -- callable that takes the next piece of output text
    txt  -- one line of body text
    refs -- the known reference labels

    Every "[label]" whose label is in refs becomes a link to that
    reference's entry; all other text is escaped with doChars() and
    written as is, including square brackets that do not enclose a
    known label.

    >>> import StringIO
    >>> w = StringIO.StringIO()
    >>> bodyText(w.write, 'abc [def] ghi', ['def']); w.getvalue()
    "abc <a href='#ref_def'>[def]</a> ghi"
    >>> w = StringIO.StringIO()
    >>> bodyText(w.write, 'a [x] b [def] c] d', ['def']); w.getvalue()
    "a [x] b <a href='#ref_def'>[def]</a> c] d"
    """

    segments = txt.split('[')

    # nothing before the first '[' can be a reference
    doChars(w, segments[0])

    for seg in segments[1:]:
        for ref in refs:
            if seg.startswith(ref + ']'):
                w("<a href='#%s'>[%s]</a>" % (asID(ref), ref))
                # keep everything after the label, further ']' included
                seg = seg[len(ref) + 1:]
                break
        else:
            # not a known label: put back the '[' that split() removed
            w("[")
        doChars(w, seg)


def doChars(w, txt):
    """Write txt through w with "&" and "<" replaced by entities."""
    # "&" first, so the "&" of "&lt;" is not escaped a second time
    escaped = txt.replace("&", "&amp;")
    escaped = escaped.replace("<", "&lt;")
    w(escaped)


import string
# ala iana-token = 1*(ALPHA / DIGIT / "-")
# ascii_letters rather than letters: ALPHA means A-Z / a-z only, whereas
# string.letters varies with the locale (and is absent from Python 3).
NAMECHARS = string.ascii_letters + string.digits + '-'

# Labels that open a new item inside a structured section; see
# doStructuredSection(), which tests the text before a line's first ":"
# against this tuple.
Tags = (
    'Purpose',
    'Formal Definition',
    'Value Type',
    'Property Parameters',
    'Property Parameter',
    'Conformance',
    'Description',
    'Format Definition',
    'Example',

    'To',
    'Subject',
    'Type name',
    'Type purpose',
    'Type encoding',
    'Type value',
    'Type special notes',
    'Type example',
)

def doStructuredSection(w, lines, refs, secType, rdfClass=None):
    """Write a registration-template section as a <dl>; close its <div>.

    w        -- callable that takes the next piece of output text
    lines    -- the section's lines; the template starts at lines[2]
    refs     -- reference labels, passed through for linking
    secType  -- kind of thing registered; "<secType> Name" is the label
                of the item that names it
    rdfClass -- passed through to subSect()

    A line opens a new item when the text before its first ":" (leading
    blanks removed) is "<secType> Name" or one of Tags.  The remainder
    of that line and all lines up to the next such label form the item
    body; each item is written by subSect().
    """
    w("<dl>\n")

    nameLabel = '%s Name' % secType

    tag = ''
    body = []

    for text in lines[2:]:
        opensItem = False
        if ':' in text:
            label, remainder = text.lstrip().split(":", 1)
            opensItem = (label == nameLabel or label in Tags)

        if not opensItem:
            body.append(text)
            continue

        # flush the item collected so far, then start the new one
        if body:
            subSect(w, secType, rdfClass, tag, body, refs)
            body = []
        tag = label
        body.append(remainder)

    if body:
        subSect(w, secType, rdfClass, tag, body, refs)
    w("</dl>\n")
    w("</div>\n")


def subSect(w, secType, rdfClass, dt, dd, refs):
    """Write one <dt>/<dd> item of a structured section.

    w        -- callable that takes the next piece of output text
    secType  -- section type; compared below against "Property",
                "Component" and "Value", and used as an id prefix
    rdfClass -- when true, the naming item's id is the camelCase form of
                the name (initial capital when rdfClass is 'Class');
                otherwise the id is "<secType>_<name>"
    dt       -- the item's label
    dd       -- list of the item's lines; dd[0] is the remainder of the
                label line
    refs     -- reference labels, passed to bodyText() for linking

    The <dd> always ends with a <pre> block; the closing tags for both
    are written once at the end, for either branch.
    """
    if dt.endswith(" Name") or dt == 'Type name':
        # The naming item: its value supplies the id of the <dt>.
        name = ''.join(dd).strip()

        # VEVENT is quoted extraneously
        # NOTE(review): name[0] raises IndexError when the value is empty.
        if name[0] == '"':
            name = name[1:-1]
            dd[0] = name
        if name.startswith("Any property name with"):
            # the section describing non-standard properties
            name = "X-"
        elif rdfClass:
            name = camelCase(name, rdfClass == 'Class')
        else:
            name = secType + "_" + name
        w("<dt id='%s'>%s</dt>\n" % (name, dt))
    else:
        w("<dt>%s</dt>\n" % (dt,))

    # the label with its spaces removed serves as the CSS class
    w("<dd class='%s'>" % (asClass(dt),))

    if dt == 'Value Type':
        # The first line names the value type, up to its first "."; it
        # becomes a link to that type's "Value_<name>" anchor.
        rest = dd[0]
        rel='value-type'
        if '.' in rest:
            name, rest = rest.split('.', 1)
            name = name.strip()
            rest = '.' + rest
        else:
            name = rest.strip()
            rest = ''

        if name.startswith("The default"):
            # prose such as "The default ... <type>": the last word is
            # the type name and the words before it are written as text
            # (directly through w, without doChars escaping)
            rel='default-value-type'
            txt = name
            name = txt.split()[-1]
            w(txt[:-len(name)])

        if 'separated' in rest:
            rel = 'list-of'
        w("<a rel='%s' href='#Value_%s'>%s</a> <pre>   %s\n"
          % (rel, name, name, rest))

        # In the remaining lines, link the value type names listed here.
        # tokenRefs writes the text up to and including the last match
        # and returns the tail, which bodyText then writes.
        for l in dd[1:]:
            l = tokenRefs(w, l, 'allowed-type',
                          {'DATE': 'Value_DATE',
                           'DATE-TIME': 'Value_DATE-TIME',
                           'PERIOD': 'Value_PERIOD',
                           'BINARY': 'Value_BINARY'})
            bodyText(w, l, refs)
            w("\n")

    else:
        # Choose which upper-case tokens to link in this item, if any,
        # and the rel value for those links.
        tokens = None
        rel = None
        
        if secType == "Property" and \
                        (dt == 'Conformance' or dt == 'Description'):
            # component names mentioned in a property's text
            rel = 'applies-to'
            tokens = {'VEVENT': 'Vevent',
                      'VTODO': 'Vtodo',
                      'VJOURNAL': 'Vjournal',
                      'VFREEBUSY': 'Vfreebusy',
                      'VTIMEZONE': 'Vtimezone',
                      'VALARM': 'Valarm'
                      }
        elif dt == 'Description' and secType == "Component":
            # rel 'def': tokenRefs also puts an id on these links
            rel = 'def'
            tokens = {
                'STANDARD': 'standard',
                'DAYLIGHT': 'daylight',
                }
        elif dt == 'Description' and secType == "Value":
            # the parts of a recurrence rule
            rel = 'def'
            tokens = {
                'FREQ': 'freq',
                'UNTIL': 'until',
                'COUNT': 'count',
                'INTERVAL': 'interval',
                'BYSECOND': 'bysecond',
                'BYMINUTE': 'byminute',
                'BYHOUR': 'byhour',
                'BYDAY': 'byday',
                'BYMONTHDAY': 'bymonthday',
                'BYYEARDAY': 'byyearday',
                'BYWEEKNO': 'byweekno',
                'BYMONTH': 'bymonth',
                'BYSETPOS': 'bysetpos',
                'WKST': 'wkst',
                }

        w("<pre>   ")

        for l in dd:
            if tokens: l = tokenRefs(w, l, rel, tokens)
            bodyText(w, l, refs)
            w("\n")

    # closes the <pre> opened in either branch, then the <dd>
    w("</pre>\n</dd>\n")


def asClass(t):
    """turn heading tag t into a class name by dropping its spaces
    """
    pieces = t.split(" ")
    return ''.join(pieces)


def tokenRefs(w, l, rel, tokens):
    """Write line l up to its last token, linking every token found.

    w      -- callable that takes the next piece of output text
    l      -- the line of text to scan
    rel    -- rel attribute for the links; with 'def', the first link
              to each target within this line also gets that target as
              its id
    tokens -- maps each token to the fragment id its link points to

    Returns the text after the last token (all of l when nothing
    matches); that tail is NOT written, the caller is expected to do so.

    Longer tokens are tried before shorter ones, so that a token which
    is a prefix of another (DATE / DATE-TIME, BYMONTH / BYMONTHDAY)
    cannot shadow it; the order used to depend on dict iteration order.

    >>> import StringIO
    >>> w = StringIO.StringIO()
    >>> tokenRefs(w.write, 'a DATE-TIME or DATE value', 'allowed-type', {'DATE': 'Value_DATE', 'DATE-TIME': 'Value_DATE-TIME'})
    ' value'
    >>> w.getvalue()
    'a <a rel="allowed-type" href="#Value_DATE-TIME">DATE-TIME</a> or <a rel="allowed-type" href="#Value_DATE">DATE</a>'
    """
    if not tokens:
        # an empty alternation would match the empty string everywhere
        return l

    names = sorted(tokens.keys(), key=lambda k: (-len(k), k))
    pat = re.compile('|'.join([re.escape(k) for k in names]))

    # NOTE(review): seen is per call, so a token defined on several lines
    # still gets the same id more than once in the document.
    seen = {}
    while l:
        m = pat.search(l)
        if not m: break

        doChars(w, l[:m.start()])
        t = m.group(0)
        target = tokens[t]
        if rel == 'def' and target not in seen:
            w('<a id="%s" rel="%s" href="#%s">%s</a>' % (target, rel,
                                                         target, t))
            seen[target] = 1
        else:
            w('<a rel="%s" href="#%s">%s</a>' % (rel, target, t))
        l = l[m.end():]
    return l

def camelCase(n, initialCap=0):
    """Turn a hyphenated name into camelCase.

    n          -- the name, e.g. 'LAST-MODIFIED'
    initialCap -- when true, capitalize the first word as well

    Empty words, as produced by a leading, trailing or doubled hyphen,
    contribute nothing to the result.

    >>> camelCase('LAST-MODIFIED')
    'lastModified'
    >>> camelCase('VEVENT', 1)
    'Vevent'
    >>> camelCase('X-')
    'x'
    """
    words = [part.lower() for part in n.split('-')]

    def ucfirst(w):
        # slices rather than w[0]: no IndexError on an empty word
        return w[:1].upper() + w[1:]

    if initialCap:
        first = ucfirst(words[0])
    else:
        first = words[0]
    return first + ''.join([ucfirst(w) for w in words[1:]])
        

def _test():
    """Run the doctests in this module's docstrings (the --test option)."""
    from doctest import testmod
    testmod()


if __name__ == '__main__':
    if '--test' in sys.argv:
        _test()
    else:
        try:
            main(sys.argv)
        except Usage:
            # The docstring of Usage is the usage message.  Exit with a
            # non-zero status so that callers (make, shell scripts) can
            # tell a usage error from a successful run.
            sys.stderr.write(Usage.__doc__ + "\n")
            sys.exit(2)

# $Log: slurpIcalSpec.py,v $
# Revision 1.27  2005/11/09 23:10:49  connolly
# - changed the way duration values are modelled
#     The iCalendar DURATION value type is actually more than just a
#     XMLSchema.duration; it also has a RELATED parameter.
#     So for
#       TRIGGER;VALUE=DURATION;RELATED=START:-PT15M
#     we'll write
#       { ?E cal:trigger [ rdf:value "-PT15M"^^xsdt:duration;
#                          cal:related "START"] }
#
# - fixed test data to have rdf:datatype on integer
#   values, to match the schema (which matches the RFC)
#
# - fixed schema to show DATE-TIME properties (dtstart, ...)
#   as DatatypeProperties
#   (there are little/no tests for PERIOD; beware)
#
# - scraped more details about property parameters (e.g. partstat, cn,
#   cutype, ...) and rrule parts (freq, interval, ...) from the RFC so
#   that they show up as links in the hypertext version and as RDF
#   properties in the schema.  likewise timezone components (standard,
#   daylight)
#  - side effect: added some whitespace in rfc2445.html
#
# - demoted x- properties
#  - removed x- properties from .rdf versions of test data
#    this allows the round-trip tests to pass
#  - fromIcal.py doesn't output them unless you give the --x option
#
# - added Makefile support for consistency checking with pellet
#
# - demoted blank line diagnostic in fromIcal.py to a comment
#
# - silenced some left-over debug diagnostics in slurpIcalSpec.py
#
# - fixed test/test-created.rdf; added it to fromIcalTest.py list
#
# Revision 1.26  2005/07/22 21:14:32  connolly
# remove : from iCalendar heading
#
# Revision 1.25  2005/07/22 21:00:00  connolly
# - added support for RFC2425, which has
#  - numbered Abstract and TOC
#  - examples that start in column 1
#
# Revision 1.24  2005/07/22 20:42:12  connolly
# - handle VCARD structured section tags
# - working on example extraction; started with CSS style
# - no bullets on TOC items; just the numbers
# - handle a ref split across lines in VCARD as a couple typos
#
# Revision 1.23  2005/07/22 19:51:28  connolly
# - parameterize RFC-specific bits so it works for RFC2426 also
# - factor out typo handling
# - take RFC number on command line; write diagnostic for incorrect usage
#
# Revision 1.22  2005/07/22 19:28:18  connolly
# - mark up titles in bibliography; make links to RFCs
# - render copyright, acks sections flowed rather than pre
# - fix extra . at end of section ID
#
# Revision 1.21  2005/07/22 18:49:58  connolly
# - handle references in 2 passes
#  - 1st pass to find ref labels in refs section
#  - 2nd pass to format references from the body and the bibliography
# - added some unit tests and a --test option
# - handle unnumbered overview section in TOC
#
# Revision 1.20  2004/02/29 14:52:00  connolly
# new grddl names
#
# Revision 1.19  2004/02/12 06:31:23  connolly
# fix EVENT to VEVENT typo
#
# Revision 1.18  2004/02/08 03:30:54  connolly
# allow for odd indentation of 4.3.8 Integer
#
# Revision 1.17  2004/02/08 00:06:03  connolly
# find domain info in Descriptions of properties as well as Conformance
#
# Revision 1.16  2004/02/07 06:30:12  connolly
# take out broken conformance links to Vcalendar
#
# Revision 1.15  2004/02/07 06:02:02  connolly
# - links from property conformance subsections to components
#
# Revision 1.14  2004/02/07 05:31:21  connolly
# - handle Property Name: Any ... X-
# - add purposes to formal schema as rdfs:comment
#
# Revision 1.13  2004/02/07 05:21:33  connolly
# - simplify subSect
# - fix a typo in RFC 2445
#
# Revision 1.12  2004/02/07 04:55:50  connolly
# - refactored doStructuredSection to collect dd lines
# - use doChars() to fix a bug noted in a comment
# - removed deblank (dead code)
# - removed list(lines) (debugging code)
#
# Revision 1.11  2004/02/07 04:30:41  connolly
# - use generators for depagination, section splitting
# - cite RFC guidelines RFC
# - factor out some hard-coded strings
#
# Revision 1.10  2004/02/07 02:39:10  connolly
# doStructuredSection was getting out of hand;
# refactored it before working on Conformance section
#
# Revision 1.9  2004/02/07 00:04:37  connolly
# find more value type info; turn into allowed-type links
#
# Revision 1.8  2004/02/01 07:43:16  connolly
# recognize (though do not fully handle) value types with defaults
#
# Revision 1.7  2004/02/01 06:55:11  connolly
# add provenance in address element
#
# Revision 1.6  2004/01/30 01:15:39  connolly
# fixed case/hyphenation
#
# Revision 1.5  2004/01/29 19:40:47  connolly
# added profile for GRDDL
#
# Revision 1.4  2004/01/29 16:49:32  connolly
# first steps towards gleaning a schema from RFC2445 via XHTML, XSLT
#
# Revision 1.3  2004/01/28 10:29:46  connolly
# handle (some cases) of prose in the Value Type field
# handle more section types
#
# Revision 1.2  2004/01/28 10:02:03  connolly
# groks quite a bit more structure
#
# Revision 1.1  2004/01/28 08:54:24  connolly
# produces pretty reasonable XHTML
#