1    #!/usr/bin/python
       2    
       3    # This is derived from studying: /usr/lib/python1.5/*mllib.py
       4    #
       5    # Time-stamp: <2002-03-26 01:06:52 cymbala>
       6    # -------------------------------------------------------
       7    
       8    
       9    # SYNTAX EXAMPLE:
      10    #
      11    #  $ python getanchr.py index.html | sort -t'	' -k 1,1 -k 2,2
      12    #  index.htm	#a	[A]
      13    #  index.htm	#an	[A]
      14    #  index.htm	/home/works/cw/volume07.htm	Collected Works Volume 7
      15    #  index.htm	/home/works/index.htm	Lenin Works Archive
      16    #  ...
      17    #  index.htm	mailto:cymbala@lafn.org	Robert Cymbala
      18    #  $
      19    #
      20    # Output is in the form of three tab-delimited columns:
      21    #    1. source file.
      22    #    2. href=value.
      23    #    3. data surrounded by anchor element (i.e., <a></a>).
      24    #
      25    # -------------------------------------------------------
      26    
      27    
      28    import os
      29    import string
      30    from sgmllib import SGMLParser
      31    
      32    class MySGMLParser(SGMLParser):
      33    
      34        # This is derived from "class TestSGMLParser" in
      35        # file "/usr/lib/python1.5/sgmllib.py".
      36    
      37        def __init__(self, in_file, verbose=0):
      38            self.in_file = os.path.abspath(in_file)
      39            self.mydata = ""
      40            SGMLParser.__init__(self, verbose)
      41    
      42            # Taken from file "/usr/lib/python1.5/htmllib.py".
      43            self.savedata = None
      44            self.anchorlist = []
      45            self.nofill = 0
      46    
      47            self.base = os.path.dirname(self.in_file)
      48            if self.base == '' or self.base == '.':
      49                self.base = os.getcwd()
      50                pass
      51    
      52            pass
      53    
      54        def print_out(self, args_passed):
      55            args = args_passed
      56            args[0] = self.shorten_abspath(args[0])
      57            if not args[1][:7] == 'mailto:': args[1] = self.shorten_abspath(
      58                os.path.normpath(
      59                os.path.join(self.base, args[1])))
      60            print '%s\t%s\t%s' % (args[0], args[1], args[2])
      61            pass
      62    
      63        def handle_data(self, data):
      64            self.mydata = self.mydata + data
      65    
      66            # From file "/usr/lib/python1.5/htmllib.py".
      67            if self.savedata is not None:
      68                self.savedata = self.savedata + data
      69                pass
      70            
      71            if len(`self.mydata`) >= 70:
      72                self.flush()
      73                pass
      74            pass
      75    
      76        def flush(self):
      77            data = self.mydata
      78            if data:
      79                self.mydata = ""
      80                #
      81                # print 'data:', `data`
      82                pass
      83            pass
      84    
      85        def close(self):
      86            SGMLParser.close(self)
      87            self.flush()
      88            pass
      89    
      90    
      91        # Taken from file "/usr/lib/python1.5/sgmllib.py".
      92        def unknown_starttag(self, tag, attrs):
      93            self.stack.append(tag)
      94            pass
      95        #
      96        # Prevent some elements from getting into stack.
      97        def do_hr(self, attrs): pass
      98        def do_br(self, attrs): pass
      99        def do_area(self, attrs): pass
     100        def do_img(self, attrs): pass
     101        def do_base(self, attrs): pass
     102        # def do_link(self, attrs): pass
     103        def do_meta(self, attrs): pass
     104        # -------------------------------------------------------
     105    
     106    
     107    
     108        # Taken from file "/usr/lib/python1.5/htmllib.py".
     109        def save_bgn(self):
     110            self.savedata = ''
     111            pass
     112        #
     113        def save_end(self):
     114            data = self.savedata
     115            self.savedata = None
     116            if not self.nofill:
     117                data = string.join(string.split(data))
     118                pass
     119            return data
     120        #
     121        #
     122        def start_a(self, attrs):
     123            href = ''
     124            name = ''
     125            type = ''
     126            for attrname, value in attrs:
     127                value = string.strip(value)
     128                if attrname == 'href':
     129                    href = value
     130                if attrname == 'name':
     131                    name = value
     132                if attrname == 'type':
     133                    type = string.lower(value)
     134                    pass
     135                pass
     136            self.anchor_bgn(href, name, type)
     137            pass
     138        #        
     139        def end_a(self):
     140            self.anchor_end()
     141            pass
     142        #
     143        #
     144        def anchor_bgn(self, href, name, type):
     145            self.anchor = href
     146            if self.anchor:
     147                self.anchorlist.append(href)
     148                pass
     149    
     150            # Save data in anchor element:
     151            self.save_bgn()
     152            pass
     153        #            
     154        def anchor_end(self):
     155            if self.anchor:
     156                #
     157                # Save data in anchor element:
     158                self.anchor_data = self.save_end()
     159                #
     160                self.print_out([self.in_file,
     161                                self.anchor,
     162                                self.anchor_data])
     163                
     164                self.handle_data("[%d]" % len(self.anchorlist))
     165                self.anchor = None
     166                pass
     167            pass
     168        #
     169        def do_base(self, attrs):
     170            for a, v in attrs:
     171                if a == 'href':
     172                    self.base = v
     173                    pass
     174                pass
     175            pass
     176        #
     177        def do_link(self, attrs):
     178            href = others = ''
     179            for a, v in attrs:
     180                if a == 'href': href = v
     181                else: others = others + ',' + v
     182                pass
     183            if others == '': others = ','
     184            if href:
     185                self.print_out([self.in_file, href, others[1:]])
     186            pass
     187        # -------------------------------------------------------
     188    
     189        def shorten_abspath(self, path):
     190            common_root = '/www/public_html/'
     191            if path[:len(common_root)] == common_root:
     192                return path[len(common_root)-1:]
     193            else: return path
     194            pass
     195    
     196        pass
     197    
     198    
     199    def my_program(args_passed = None):
     200    
     201        # This is derived from "def test" in
     202        # file "/usr/lib/python1.5/sgmllib.py".
     203    
     204        import sys
     205    
     206        # Example in /usr/lib/python1.5/sgmllib.py is for testing only, and
     207        # example in /usr/lib/python1.5/htmllib.py is for testing only.
     208        # They don't handle multiple input files; this one does.
     209        import getopt
     210    
     211        options = 's'
     212        if args_passed:
     213            optlist, args = getopt.getopt(args_passed, options)
     214            pass
     215        else:
     216            optlist, args = getopt.getopt(sys.argv[1:], options)
     217            pass
     218    
     219        if args == []: args = ['-']
     220    
     221        for file in args:
     222            if file == '-':
     223                f = sys.stdin
     224                pass
     225            else:
     226                try:
     227                    f = open(file, 'r')
     228                    pass
     229                except IOError, msg:
     230                    print file, ":", msg
     231                    sys.exit(1)
     232                    pass
     233                pass        
     234    
     235            data = f.read()
     236            #
     237            if file is not '-':
     238                f.close()
     239                pass
     240    
     241            klass = MySGMLParser
     242            x = klass(file)
     243            #
     244            for c in data:
     245                x.feed(c)
     246                pass
     247            x.close()
     248            pass
     249        pass
     250    
     251    my_program()
     252    
     253    ###
     254    #