#!/usr/bin/env python3

"""A pathetically simple Python script to convert (my) simple graph representations
to a dot(1) configuration file.  Keith Hellman <khellman@mines.edu> for 
Spring 2014, 2016 Compilers.

On a Linux box with dot(1) installed (typically found in the graphvis package),
invoke like this:

 $ ls
 parsetree.txt
 $ cat parsetree.txt | ./tree-to-graphvis | dot -Tpng -o parsetree.png
 $ see parsetree.png

or

 $ display parsetree.png

Input format is in two parts:  node identification and then edge identification.

 $ cat parsetree.txt
 nodeA Node A . shape=box
 leafB Leaf B
 nodeC Node C
 leafD Leaf D
 leafE Leaf E
 
 nodeA leafB nodeC
 nodeC leafD leafE
 $ 

This would generate a graph:
  +--------+
  | Node A |
  +--------+
    /    \
 Leaf B  Node C
         /   \
     Leaf D   \
             Leaf E


The input format is line oriented and does not support comments.  Note that a
single empty line is important dividing the node definitions from edge defs.

The input can be easily generated by a recursive routine in most languages, given
your tree is stored as a "list of lists".  In which case consider making the 
identifer for any node a 0- followed by the sequence of children indices leading
to the node.  In the example graph above, leafB would 0-0 and nodeC would be 0-1,
leafE would be 0-1-1.  A compiler's "raw parse tree" with an identical structure
to the example graph above might look like

0 S
0-0 b
0-1 C
0-1-0 lambda
0-1-1 e

0 0-0 0-1
0-1 0-1-0 0-1-1

(Leaf D has been changed to a lambda for the sake of an example for students.)

='d keypairs following . can be used for dot attributes; see dot language 
definition for possible attributes.

Node names can use a graphvis styled table markup, see 
  https://www.graphviz.org/doc/info/shapes.html#html

-n, -N, -e, -E options can be used for attributes of a specific node, all nodes,
a specific edge, or all edges (respectively).  The syntax looks like 
 
 ... -N shape=egg -e NodeC LeafD color=red ...
  
-g can be used to specify whole graph options.  Eg:

 $ cat parsetree.txt | ./tree-to-graphvis -g 'ratio=1.2;' | dot -Tpng -o parsetree.png

-ns can be used to 'namespace' a graph so that it can be gvpack(1)'d with others
without node identifier (namespace) collisions.  Eg:

 $ for i in 1; do \
    cat parsetree1.txt | ./tree-to-graphvis -ns ptOne ;\
    cat parsetree2.txt | ./tree-to-graphvis -ns ptTwo ; done |\
     gvpack -u | dot -Tpng >g.png

"""

import sys
import re


class Dot_Xlate :
    # note the ^ anchor and +? non-greedy forms
    TABLERC = re.compile(r'^<[Tt][Aa][Bb][Ll][Ee][^>]*>.+?</[Tt][Aa][Bb][Ll][Ee]>')
    TDRC = re.compile(r'^<[Tt][Dd][^>]*>.+?</[Tt][Dd]>')
    def __init__( self, *extra_subs ) :
        d = { 
            'emptyset':'&empty;',
            'bullet':'&bull;',
            'lambda':'&lambda;',
            '->':'&rarr;',
            '<-':'&larr;',
            # use --html=><= option to include these
            #'=>':'&rArr;',
            #'<=':'&lArr;',
            'langle':'&lang;',
            'rangle':'&rang;',
            'prop':'&prop',
            'propto':'&prop;',
            'forall':'&forall;',
            'exist':'&exist;',
            'nabla':'&nabla;',
            'isin':'&isin;',          # latex $\in$, seems to generic to make a key
            'sim':'&sim;',
            'wedge':'&wedge;',
            'vee':'&vee;',
            'cap':'&cap;',
            'cup':'&cup;',
            'perp':'&perp;',
            'times':'&times;',
            'otimes':'&otimes;',
            'oplus':'&oplus;',
            'spade':'&spades;',
            'spadesuit':'&spades;',
            'heart':'&#x2661;',      # &hearts; is a filled shape, hearts are red in a deck though...
            'heartsuit':'&#x2661;',
            'forall':'&forall;',
            'lowast':'&lowast;',
            'ast':'&lowast;',
            'diamond':'&diams;',
            'diamondsuit':'&diams;',
            'clubsuit':'&clubs;',
            'club':'&clubs;',
            'alpha':'&alpha;',
            'beta':'&beta;',
            'gamma':'&gamma;',
             r'\\':r'\\\\',
            }
        self.__D = {} 
        for k, i in d.items() :
            self.__add_sub( k, i )
        for k, i in extra_subs :
            self.__add_sub( k, i )
    def __add_sub( self, k, i ) :
            # we don't want the term replaced if it just a part of the label, 
            # make sure we match the whole and only token
            rc = re.compile( r'''(^|\s+|\\[nlr]|["',{])''' + k + r'''($|\s+|["',}\\])''' )
            self.__D[(k,i)] = [rc]
            # the & and ; make sure to *not* double match against a simple substitution 
            # such as lambda -> &lambda; -> &&lambda;;
            rc = re.compile( r'''([a-zA-Z0-9]*[^-_a-zA-Z0-9&])''' + k + r'''($|\\[nlr]|[^-_a-zA-Z0-9;])''' )
            self.__D[(k,i)].append(rc)
    def __call__( self, t ) :
        if Dot_Xlate.TABLERC.match(t) :
            # a graphvis html style table label, it is already html escaped
            return t
        # __first__ replace an & with &amp;
        orig=str(t)
        for (k,i), Rclist in self.__D.items() :
            for Rc in Rclist :
                #print( "implicit label substitution test '%s' against '%s'" % (t,Rc.pattern) )
                newt = Rc.sub( r'\1'+i+r'\2', t )
                if newt != t :
                    print( "implicit label substitution change '%s' through '%s' -> '%s'" % (t,Rc.pattern,newt), file=sys.stderr )
                t = newt

        # lastly replace any < or >
        t.replace( '<', '&lt;' )
        t.replace( '>', '&gt;' )
#        if not t == orig :
#            print( "implicit label substitution  input: '%s'" % (orig,) )
#            print( "implicit label substitution output: '%s'" % (t,) )
#        else :
#            print( "implicit label substitution retains '%s'" % (t,) )
        return t

dot_xlate = None

def read( f, edgeprops ) :
    nodes = {}
    attrs = {}
    edges = []

    l = f.readline()
    while l :
        l = l.strip()
        if not l :
            l = f.readline()
            continue

        y = l.split( " . " )
        if len(y)==1 :
            nd, att = y[0], ""
        else :
            nd, att = y[0], " . ".join(y[1:]).strip()
        y = nd.split()
        if len(y)==1 :
            node, resid = y[0], [y[0],]
        else :
            node, resid = y[0], y[1:]

        if node in nodes :
            # edges specification
            assert( type(resid) == type([]) )
            for c in resid :
                edges.append( (node,c) )
                edgeprops.setdefault( (node,c), [] )
                if att :
                    if 'label=' in att :
                        edgeprops[ (node,c) ].append( 'labeldistance=3.0' )
                    edgeprops[ (node,c) ].append( att )
        else :
            # node definition and resid is description
            if type(resid) == type([]) :
                resid = " ".join( resid )
            global dot_xlate
            nodes[node] = dot_xlate(resid)
            if att :
                attrs.setdefault(node,[]).append( dot_xlate( att ) )

        l = f.readline()

    return nodes, edges, edgeprops, attrs

# a little confusing, attrs for nodes come from the original input file,
# nodeprops come from command line
def _writenodes( f, ns, nodes, nodeattrs, globnode, nodeprops, sep ) :
    def update_html_label_value( label_value ) :
        # if it is an HTML table label, we use <> as delimiters
        b, e = ('"', '"')
        if Dot_Xlate.TABLERC.match(label_value) :
            b, e = ('<', '>')  
        # abbreviated form, easier to remember and edit by hand
        elif Dot_Xlate.TDRC.match(label_value) :
            b, e = ('<<TABLE BORDER="0" CELLBORDER="0"><TR>', '</TR></TABLE>>')  
        return b, e
    def next_attr_name( input, punct=(';',','), quotes=('"',"'") ) :
        name = ''
        for i in range(len(input)) :
            if input[i] == '=' :
                return name, input[i+1:]
            if input[i].isspace() or input[i] in punct :
                # we should see these only when not having found any name
                # characters yet
                assert( not len(name))
                continue
            name = name + input[i]
        # names are followed by =, we've exhausted input so name should
        # be empty
        assert( not len(name))
        return name, ''
    def next_attr_value(input, punct=(';',','), quotes=('"',"'") ) :
        if not len(input) :
            return ''
        if input[0].isspace() :
            return next_attr_value(input[1:], punct=punct, quotes=quotes )
        if input[0] in quotes :
            # quoted value, possibly escaped chars
            value = input[0]
            esc = False
            for i in range(1,len(input)) :
                if esc :
                    esc = False
                    if input[i] in ('n',) :
                        value = value + '\\' + input[i]
                    else :
                        value = value + input[i]
                    continue
                if input[i] == '\\' :
                    esc = True
                    continue
                value = value + input[i]
                #print( "value", value )
                if input[i]==input[0] :
                    # closing quote, already stored in value
                    #print( "return", value, value, input[i+1:] )
                    return value, input[i+1:]
            # unterminated quote
            assert( not value )
        else :
            # unquoted value may be an HTML formulation
            for htmlre in ( Dot_Xlate.TABLERC, Dot_Xlate.TDRC ) :
                m = htmlre.match(input)
                if m :
                    return input[m.start():m.end()], input[m.end():]
            # unquoted value, read to the end, space or punct
            value = input[0]
            for i in range(1,len(input)) :
                if input[i] in punct or input[i].isspace() :
                    return value, input[i+1:]
                value = value + input[i]
            return value, '' 
    def key_value_scan_attrib_data( input ) :
        attr = []
        if type(input) is type([]) :
            for x in input :
                if not x :
                    continue
                attr.extend( key_value_scan_attrib_data( x ))
        elif type(input) is type("") :
            attrname, input = next_attr_name( input ) 
            while len(input) and not attrname :
                attrname, input = next_attr_name( input ) 
#            print( "attrname", attrname, "input", input )
            if not attrname :
                return attr
            attrvalue, input = next_attr_value( input )
#            print( "attrname", attrname, "attrvalue", attrvalue, "input", input )
            attr.append( '%s=%s' % (attrname, attrvalue,) )
            attr.extend( key_value_scan_attrib_data(input) )
        return attr

    for n, l in nodes.items() :
        a = nodeattrs.get( n, [] )
        a.extend( globnode )
        a.extend( nodeprops.get( n, [] ) )
#        print( "a in", a )
        a[:] = key_value_scan_attrib_data( a )
#        print( "a out", a )
        # insert properly delim'd label= into a for label value l
        b, e = update_html_label_value( l )
        # it goes in the front as it should be overridden by an explicit label= attr
        a.insert(0,'label=%s%s%s'%(b,l,e,))
        # start at one since we already know a[0] is correct...
        for i in range(1,len(a)) :
            #print( a[i] )
            # convert attrib values to properly delim'd format for graphvis
            attrname, attrvalue = a[i].split('=',1)
            if attrname not in ('label', 'xlabel') :
                continue
            if attrvalue.startswith('"') or attrvalue.startswith("'") :
                # already quoted, don't modify
                continue
            b, e = update_html_label_value( attrvalue )
            a[i] = '%s=%s%s%s'%(attrname,b,attrvalue,e,)
        f.write( '"%s%s" [%s];' % ( ns, n, ", ".join(a) ) )
        f.write( sep )

def _writeedges( f, ns, edges, globedge, edgeprops, sep, edgesym ) :
    for n, c in edges :
        attrs = []
        attrs.extend( globedge )
        attrs.extend( edgeprops.get( (n,c), [] ))
        if attrs :
            f.write( '"%s%s" %s "%s%s" [%s];' % ( ns, n, edgesym, ns, c, ",".join(attrs) ) )
        else :
            f.write( '"%s%s" %s "%s%s";' % ( ns, n, edgesym, ns, c ) )
        f.write( sep )

def treewrite( f, ns, nodes, edges, attrs, graphprops=[], globnode=[], nodeprops={}, globedge=[],
        edgeprops={}, sep="\n  ", edgesym='--' ) :
    f.write( "graph Pt {" + sep + sep.join( ["ordering=out;",] + graphprops ) + sep )
    _writenodes( f, ns, nodes, attrs, globnode, nodeprops, sep )
    _writeedges( f, ns, edges, globedge, edgeprops, sep, edgesym )
    f.write( "}\n" );

def fawrite( f, ns, nodes, edges, attrs, graphprops=[], globnode=[], nodeprops={}, globedge=[], 
        edgeprops={}, sep="\n  ", edgesym='->' ) :
    f.write( "digraph Fa {" + (sep if graphprops else '') + sep.join( graphprops ) + sep )
    _writenodes( f, ns, nodes, attrs, globnode, nodeprops, sep )
    _writeedges( f, ns, edges, globedge, edgeprops, sep, edgesym )
    f.write( "}\n" );

def sc( gp ) :
    return [x if x.endswith(';') else x + ';' for x in gp]

import os

n = os.path.basename(sys.argv[0]).split('-')[0]
if n in ( 'tree', 'parsetree' ) :
    write=treewrite
elif n == 'fa' :
    write=fawrite
else :
    n=treewrite

sctail = re.compile( r';\s*$' )
output=None
infile=None
filecount=0
graphprops=[]
edgeprops={}
nodeprops={}
globedge=[]
globnode=[]
ns=''   # namespace prefix
a=1
dot_xlate = Dot_Xlate()
while a < len(sys.argv) :
    if sys.argv[a] in ("-ns",) :
        # namespace support, generate each state (node) with a namespace prefix, so
        # that gvpack -u  can be used to merge multiple graphs together
        # an underscore is auto added for readability (heaven forbid you have to read
        # one of these...)
        ns = sys.argv[a+1]
        if not ns.endswith('_') :
            ns = ns + '_'
        a+=2
        continue
    if sys.argv[a] in ("--html=><=",) :
        # replace => or <= with &rArr; and &lArr
        dot_xlate = Dot_Xlate( ('=>','&rArr;'), ('<=','&lArr;') )
        a+=1
        continue
    if sys.argv[a] in ("-n","--nodeprop",) :
        nodeprops.setdefault( sys.argv[a+1], [] ).append( dot_xlate(sys.argv[a+2]) )
        a+=3
        continue
    if sys.argv[a] in ("-N","--global-node",) :
        globnode.append( dot_xlate(sys.argv[a+1]) )
        a+=2
        continue
    if sys.argv[a] in ("-e","--edgeprop",) :
        edgeprops.setdefault( (sys.argv[a+1],sys.argv[a+2]), [] ).append( dot_xlate(sys.argv[a+3]) )
        a+=4
        continue
    if sys.argv[a] in ("-E","--global-edge",) :
        globedge.append( dot_xlate(sys.argv[a+1]) )
        a+=2
        continue
    if sys.argv[a] in ("-g","--graphprop",) :
        graphprops.append( sctail.sub( '', sys.argv[a+1] ) )
        a+=2
        continue
    if sys.argv[a] in ("-o","--output",) :
        output=sys.argv[a+1]
        a+=2
        continue
    if sys.argv[a] == "-" :
        n, e, edgeprops, att = read( sys.stdin, edgeprops )
        write( sys.stdout if output in (None,"-") else open(output,"w"), ns, n, e, att, 
                graphprops=sc(graphprops), globnode=globnode, nodeprops=nodeprops,
                globedge=globedge, edgeprops=edgeprops )
        output=None
        a+=1
        filecount+=1
        continue
    if not output :
        if sys.argv[a].endswith( ".gv" ) :
            # wierd, but don't overwrite
            output = sys.argv[a]+".gv"
        elif sys.argv[a] != "-" :
            output = os.path.splitext(sys.argv[a])[0] + ".gv"
    n, e, edgeprops, att = read( open(sys.argv[a]), edgeprops )
    write( sys.stdout if output in (None,"-") else open(output,"w"), ns, n, e, att, 
            graphprops=sc(graphprops), globnode=globnode, nodeprops=nodeprops,
            globedge=globedge, edgeprops=edgeprops )
    output = None
    a+=1
    filecount+=1
    continue

if not filecount :
    n, e, edgeprops, att = read( sys.stdin, edgeprops )
    write( sys.stdout if output in (None,"-") else open(output,"w"), ns, n, e, att, 
            graphprops=sc(graphprops), globnode=globnode, nodeprops=nodeprops,
            globedge=globedge, edgeprops=edgeprops )

sys.exit(0)

