1 #!/usr/bin/python 2 3 # This is derived from studying: /usr/lib/python1.5/*mllib.py 4 # 5 # Time-stamp: <2002-03-26 01:06:52 cymbala> 6 # ------------------------------------------------------- 7 8 9 # SYNTAX EXAMPLE: 10 # 11 # $ python getanchr.py index.html | sort -t' ' -k 1,1 -k 2,2 12 # index.htm #a [A] 13 # index.htm #an [A] 14 # index.htm /home/works/cw/volume07.htm Collected Works Volume 7 15 # index.htm /home/works/index.htm Lenin Works Archive 16 # ... 17 # index.htm mailto:cymbala@lafn.org Robert Cymbala 18 # $ 19 # 20 # Output is in the form of three tab-delimited columns: 21 # 1. source file. 22 # 2. href=value. 23 # 3. data surrounded by anchor element (i.e., <a></a>). 24 # 25 # ------------------------------------------------------- 26 27 28 import os 29 import string 30 from sgmllib import SGMLParser 31 32 class MySGMLParser(SGMLParser): 33 34 # This is derived from "class TestSGMLParser" in 35 # file "/usr/lib/python1.5/sgmllib.py". 36 37 def __init__(self, in_file, verbose=0): 38 self.in_file = os.path.abspath(in_file) 39 self.mydata = "" 40 SGMLParser.__init__(self, verbose) 41 42 # Taken from file "/usr/lib/python1.5/htmllib.py". 43 self.savedata = None 44 self.anchorlist = [] 45 self.nofill = 0 46 47 self.base = os.path.dirname(self.in_file) 48 if self.base == '' or self.base == '.': 49 self.base = os.getcwd() 50 pass 51 52 pass 53 54 def print_out(self, args_passed): 55 args = args_passed 56 args[0] = self.shorten_abspath(args[0]) 57 if not args[1][:7] == 'mailto:': args[1] = self.shorten_abspath( 58 os.path.normpath( 59 os.path.join(self.base, args[1]))) 60 print '%s\t%s\t%s' % (args[0], args[1], args[2]) 61 pass 62 63 def handle_data(self, data): 64 self.mydata = self.mydata + data 65 66 # From file "/usr/lib/python1.5/htmllib.py". 67 if self.savedata is not None: 68 self.savedata = self.savedata + data 69 pass 70 71 if len(`self.mydata`) >= 70: 72 self.flush() 73 pass 74 pass 75 76 def flush(self): 77 data = self.mydata 78 if data: 79 self.mydata = "" 80 # 81 # print 'data:', `data` 82 pass 83 pass 84 85 def close(self): 86 SGMLParser.close(self) 87 self.flush() 88 pass 89 90 91 # Taken from file "/usr/lib/python1.5/sgmllib.py". 92 def unknown_starttag(self, tag, attrs): 93 self.stack.append(tag) 94 pass 95 # 96 # Prevent some elements from getting into stack. 97 def do_hr(self, attrs): pass 98 def do_br(self, attrs): pass 99 def do_area(self, attrs): pass 100 def do_img(self, attrs): pass 101 def do_base(self, attrs): pass 102 # def do_link(self, attrs): pass 103 def do_meta(self, attrs): pass 104 # ------------------------------------------------------- 105 106 107 108 # Taken from file "/usr/lib/python1.5/htmllib.py". 109 def save_bgn(self): 110 self.savedata = '' 111 pass 112 # 113 def save_end(self): 114 data = self.savedata 115 self.savedata = None 116 if not self.nofill: 117 data = string.join(string.split(data)) 118 pass 119 return data 120 # 121 # 122 def start_a(self, attrs): 123 href = '' 124 name = '' 125 type = '' 126 for attrname, value in attrs: 127 value = string.strip(value) 128 if attrname == 'href': 129 href = value 130 if attrname == 'name': 131 name = value 132 if attrname == 'type': 133 type = string.lower(value) 134 pass 135 pass 136 self.anchor_bgn(href, name, type) 137 pass 138 # 139 def end_a(self): 140 self.anchor_end() 141 pass 142 # 143 # 144 def anchor_bgn(self, href, name, type): 145 self.anchor = href 146 if self.anchor: 147 self.anchorlist.append(href) 148 pass 149 150 # Save data in anchor element: 151 self.save_bgn() 152 pass 153 # 154 def anchor_end(self): 155 if self.anchor: 156 # 157 # Save data in anchor element: 158 self.anchor_data = self.save_end() 159 # 160 self.print_out([self.in_file, 161 self.anchor, 162 self.anchor_data]) 163 164 self.handle_data("[%d]" % len(self.anchorlist)) 165 self.anchor = None 166 pass 167 pass 168 # 169 def do_base(self, attrs): 170 for a, v in attrs: 171 if a == 'href': 172 self.base = v 173 pass 174 pass 175 pass 176 # 177 def do_link(self, attrs): 178 href = others = '' 179 for a, v in attrs: 180 if a == 'href': href = v 181 else: others = others + ',' + v 182 pass 183 if others == '': others = ',' 184 if href: 185 self.print_out([self.in_file, href, others[1:]]) 186 pass 187 # ------------------------------------------------------- 188 189 def shorten_abspath(self, path): 190 common_root = '/www/public_html/' 191 if path[:len(common_root)] == common_root: 192 return path[len(common_root)-1:] 193 else: return path 194 pass 195 196 pass 197 198 199 def my_program(args_passed = None): 200 201 # This is derived from "def test" in 202 # file "/usr/lib/python1.5/sgmllib.py". 203 204 import sys 205 206 # Example in /usr/lib/python1.5/sgmllib.py is for testing only, and 207 # example in /usr/lib/python1.5/htmllib.py is for testing only. 208 # They don't handle multiple input files; this one does. 209 import getopt 210 211 options = 's' 212 if args_passed: 213 optlist, args = getopt.getopt(args_passed, options) 214 pass 215 else: 216 optlist, args = getopt.getopt(sys.argv[1:], options) 217 pass 218 219 if args == []: args = ['-'] 220 221 for file in args: 222 if file == '-': 223 f = sys.stdin 224 pass 225 else: 226 try: 227 f = open(file, 'r') 228 pass 229 except IOError, msg: 230 print file, ":", msg 231 sys.exit(1) 232 pass 233 pass 234 235 data = f.read() 236 # 237 if file is not '-': 238 f.close() 239 pass 240 241 klass = MySGMLParser 242 x = klass(file) 243 # 244 for c in data: 245 x.feed(c) 246 pass 247 x.close() 248 pass 249 pass 250 251 my_program() 252 253 ### 254 # |