1    #! /usr/bin/env python
       2    
       3    # Extract line numbers and marked-up text.
       4    #
       5    # SYNTAX EXAMPLE:
       6    #
       7    #   python test.py where.xml > where.xmllib_c
       8    #   spell where.xmllib_c | sort | uniq > where.misspell
       9    #   vi where.misspell
      10    #   egrep -if where.misspell where.xmllib_c > where.misspell.final
      11    #   cat where.misspell.final
      12    # where.xml:340:   calling for the formation of a stong well-organised party, whose
      13    # where.xml:369:   in waging poliiical struggle under all circumstances and at all
      14    # where.xml:421:   sound a vigorous warning against becoming infatuated with
      15    # where.xml:516:   possibly fulfil our task
      16    # where.xml:547:   settlements and comununities, it is quite feasible for the
      17    # where.xml:558:   collective propagandist and a colluctive agitator, it is also a
      18    # where.xml:627:   most skilful propagandists, but the most capable organisers, the
      19    
      20    
      21    
      22    import sys
      23    import xmllib
      24    import string
      25    import fileinput
      26    
      27    # ---------------------------------------------------------
      28    # Unfortunately, documentation in:
      29    #   /usr/share/doc/python/html/lib/module-xmllib.html
      30    # does not match defined class functions in:
      31    #   /usr/lib/python1.5/site-packages/xml/parsers/xmllib.py
      32    # ---------------------------------------------------------
      33    #
      34    class Program(xmllib.XMLParser):
      35    
      36        def stderr(self, message):
      37            if message and (not message[-1] == '\n'): message = message + '\n'
      38            sys.stderr.write(message)
      39            pass
      40        def stdout(self, message):
      41            if message and (not message[-1] == '\n'): message = message + '\n'
      42            sys.stdout.write(message)
      43            pass
      44    
      45        def __init__(self, options):
      46    
      47            self.debug = options['debug']
      48            self.filename_position = options['filename_position']
      49    
      50            # 12.4 xmllib -- A parser for XML documents
      51            # file://localhost/usr/share/doc/python/html/lib/module-xmllib.html
      52            #
      53            # reset ()    Reset the instance. Loses all unprocessed
      54            #             data. This is called implicitly at the
      55            #             instantiation time.
      56            self.reset()
      57            pass
      58    
      59        def __call__(self):
      60    
      61            # if self.debug: self.stderr(str(self.attributes))
      62            # {}
      63            #
      64            # if self.debug: self.stderr(str(self.elements))
      65            # {}
      66            #
      67            # if self.debug: self.stderr(str(self.entitydefs.keys()))
      68            # ['quot', 'apos', 'gt', 'lt', 'amp']
      69            # if self.debug: self.stderr(str(self.entitydefs))
      70            # {'quot': '"', 'apos': ''', 'gt': '>', ...
      71            #
      72            for filename in sys.argv[1:]:
      73                if self.filename_position == 'top': self.stdout(filename)
      74                for line in fileinput.input(filename):
      75                    if line[-1] == '\n': line = line[:-1]
      76                    #
      77                    # self.stderr(line + '\n')
      78                    self.feed(line)
      79                    
      80                    pass
      81                pass
      82            
      83            pass
      84    
      85        def feed(self, data):
      86            xmllib.XMLParser.feed(self, data)
      87            if self.debug: self.stderr(' --->>> ' +data)
      88            pass
      89    
      90        # HANDLEs -------------------------------------------------------
      91        def handle_xml(self, encoding, standalone):
      92            if self.debug: self.stderr('handle_xml: ' + \
                        'encoding=' +str(encoding) +', ' + \
                        'standalone=' +str(standalone)
      95                        )
      96            pass
      97    
      98        def handle_doctype(self, tag, pubid, syslit, data):
      99            # Documentation has: ``handle_doctype (tag, data)''
     100            
     101            # if self.debug: self.stderr('handle_doctype: ' + \
     102            #             'tag=' +tag +', ' + \
     103            #             'pubid=' +pubid +', ' + \
     104            #             'syslit=' +syslit +', ' + \
     105            #             'data=' +data)
     106    
     107            if data == None:
     108                return
     109            
     110            where = None
     111            for i in range(len(data)):
     112                if not where and data[i] == '<':
     113                    if data[i:i+4] == '<!--':
     114                        where = 'comment'
     115                        start = i
     116                        pass
     117                    else:
     118                        pass
     119                    pass
     120                elif where == 'comment':
     121                    if data[i:i+3] == '-->':
     122                        self.handle_comment(data[start+4:i+3-3])
     123                        where = None
     124                        start = None
     125                        pass
     126                    pass
     127    
     128                # self.handle_special(data)
     129                pass
     130            
     131            pass
     132    
     133        def handle_starttag(self, tag, method, attrs):
     134            if self.debug: self.stderr('handle_starttag: ' + tag)
     135            pass
     136    
     137        def handle_endtag(self, tag, method):
     138            if self.debug: self.stderr('handle_endtag: ' + tag)
     139            pass
     140    
     141        def handle_data(self, data):
     142            if self.debug: self.stderr('handle_data: ' + data)
     143    
     144            # CONTENT:
     145            data_down = string.strip(data)
     146            if len(data_down) > 0:
     147                # self.stderr('handle_data: ' + data)
     148                #
     149                if self.filename_position == 'side': spam = fileinput.filename() +':'
     150                else: spam = ''
     151                spam = spam + str(fileinput.lineno()) + ':' + ' ' +data
     152                self.stdout(spam)
     153                            
     154                pass
     155            
     156            pass
     157    
     158        def handle_charref(self, name):
     159            if self.debug: self.stderr('handle_charref: ' + name)
     160            pass
     161    
     162        #
     163        #   771:    def handle_entity(self, name, strval, pubid, syslit, ndata):
     164        def handle_entityref(self, name):
     165            if self.debug: self.stderr('handle_entityref: ' + name)
     166            pass
     167    
     168        def handle_comment(self, data):
     169            if self.debug: self.stderr('handle_comment: ' + data)
     170            pass
     171    
     172        def handle_cdata(self, data):
     173            if self.debug: self.stderr('handle_cdata: ' + data)
     174            pass
     175    
     176        def handle_proc(self, name, data):
     177            if self.debug: self.stderr('handle_proc: ' + data)
     178            pass
     179    
     180        # def handle_special(self, data):
     181        #     if self.debug: self.stderr('handle_special: ' + data)
     182        #     pass
     183    
     184        # SYNTAX_ERROR -------------------------------------------------------
     185        def syntax_error(self, message):
     186            # xmllib.syntax_error(message)
     187            pass
     188        
     189        # UNKNOWNs -------------------------------------------------------
     190        def unknown_starttag(self, tag, attrs):
     191            if self.debug: self.stderr('oh no!  ...an unknown starttag: ' +tag)
     192            pass
     193        def unknown_endtag(self, tag):
     194            if self.debug: self.stderr('oh no!  ...an unknown endtag: ' +tag)
     195            pass
     196        def unknown_charref(self, ref):
     197            if self.debug: self.stderr('oh no!  ...an unknown charref: ' +ref)
     198            pass
     199        def unknown_entityref(self, ref):
     200            if self.debug: self.stderr('oh no!  ...an unknown entityref: ' +ref)
     201            pass
     202    
     203        pass
     204    
     205    
     206    # MAIN:
     207    filename_position = 'top'
     208    filename_position = 'side'
     209    #
     210    options = {'debug': None,
     211               'filename_position': filename_position}
     212    program = Program(options)
     213    program()
     214    
     215    
     216    ###
     217    #
     218    
     219    # NEW definitions (not in documentation):
     220    #    94:    def __fixelements(self):
     221    #   100:    def __fixclass(self, kl):
     222    #   105:    def __fixdict(self, dict):
     223    #   119:    def reset(self):
     224    #   136:    def setnomoretags(self):
     225    #   140:    def setliteral(self, *args):
     226    #   152:    def close(self):
     227    #   160:    def translate_references(self, data, all = 1):
     228    #   201:    def goahead(self, end):
     229    #   382:    def parse_comment(self, i):
     230    #   399:    def parse_doctype(self, res):
     231    #   445:    def parse_cdata(self, i):
     232    #   461:    def parse_proc(self, i):
     233    #   504:    def parse_attributes(self, tag, i, j):
     234    #   539:    def parse_starttag(self, i):
     235    #   615:    def parse_endtag(self, i):
     236    #   642:    def finish_starttag(self, tagname, attrdict, method):
     237    #   649:    def finish_endtag(self, tag):
     238    #   691:    def handle_starttag(self, tag, method, attrs):
     239    #   699:    def handle_charref(self, name):
     240    #   739:    def handle_comment(self, data):
     241    #   771:    def handle_entity(self, name, strval, pubid, syslit, ndata):
     242    #   780:    def flush(self):
     243    #   794:    def handle_comment(self, data):
     244    #   826:    def close(self):
     245    #   830:def test(args = None):
     246    #