#! /usr/bin/env python
# Time-stamp: <2001-10-17 22:05:33 cymbala>


# SYNTAX: python noncurlq.py FILENAME
# ====================================

# Here's an example:
#
# cymbala@debian:~$ python /floppy/noncurlq.py 04.htm
# non-curly quote: 'industrialists"[71]: \012 '
# cymbala@debian:~$
# =======================================================


# Why?

# FIND straight (non-curly) quotes in HTML data (such as
# words quoted by the author)...
#
# ... while IGNORING straight quotes that are part of
# mark-up (such as HREF values that are quoted inside an A
# anchor).


# Once located, you may manually change '"' to either
# '&#8220;' or '&#8221;' depending on whether the straight
# quote is an opener or a closer.  Likewise for "'" (and its
# corresponding numeric values for the left single quote
# and the right single quote, '&#8216;' and '&#8217;').


# Ideally, one would use "&ldquo;" "&lsquo;"
#                        "&rdquo;" "&rsquo;"
# ...instead of numeric values.
# Symbols are better (more portable) than numbers.


# Besides finding straight quotes, this script is an example
# of how to use Python's sgmllib (SGML library) to look for
# data inside an HTML file.  It's as simple as possible.  In
# fact, most of this script was COPIED from sgmllib.py (located in
# /usr/lib/python1.5).  All I had to do was make a few changes based
# on a general understanding of how it works.


# NOTE OF WARNING: Due to ``file = args[0]'' this script can
# only process one file at a time.  This is dangerous,
# because some people might ignore SYNTAX found above and
# specify multiple file names for input... they would think
# all files are being processed, whereas only the first
# would be processed!!!!!  And that would be bad (or "evil"
# to use a popular Bushism).
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


from sgmllib import *
import sys


# Adapted from the test() driver in /usr/lib/python1.5/sgmllib.py.
# -------------------------------------------------------
def test(args=None):
    """Parse one HTML file and let the parser report straight quotes.

    args -- list of command-line style arguments.  When it is empty or
            None, sys.argv[1:] is used instead.  An optional leading
            '-s' selects the plain SGMLParser class in place of
            TestSGMLParser.  The next argument names the file to read:
            '-' means standard input, and 'test.html' is used when no
            file name is given.

    Only the FIRST file name is processed.  Any further arguments are
    ignored, and a warning listing them is written to stderr so that the
    omission is not silent.

    If the file cannot be opened, "<name> : <error>" is printed and the
    process exits with status 1.
    """
    if not args:
        args = sys.argv[1:]

    if args and args[0] == '-s':
        args = args[1:]
        klass = SGMLParser
    else:
        klass = TestSGMLParser

    if args:
        filename = args[0]
    else:
        filename = 'test.html'

    if len(args) > 1:
        # Processing stays limited to one file; just make that visible.
        sys.stderr.write("warning: only %s is processed; ignoring %r\n"
                         % (filename, args[1:]))

    if filename == '-':
        data = sys.stdin.read()
    else:
        try:
            f = open(filename, 'r')
        except IOError as msg:
            print("%s : %s" % (filename, msg))
            sys.exit(1)
        try:
            data = f.read()
        finally:
            # Release the handle even if read() fails.
            f.close()

    parser = klass()
    # Feed one character at a time on purpose: the size of the chunks
    # handed to handle_data() (and therefore of the reported snippets)
    # depends on how the input is fed.
    for c in data:
        parser.feed(c)
    parser.close()
# -------------------------------------------------------




# Adapted from the TestSGMLParser class in
# /usr/lib/python1.5/sgmllib.py:
# (Diagnostic print statements are disabled.)
#
# (Added condition to look for '"' or "'" in data.)   <--- !
#
# =======================================================
class TestSGMLParser(SGMLParser):
    """SGMLParser subclass that reports text containing straight quotes.

    Text received through handle_data() is collected in self.testdata.
    The buffer is flushed when its repr() reaches 70 characters, when
    any of the tag / comment / reference callbacks below fires, and
    when the parser is closed.  On each flush, a buffer that contains
    a '"' or a "'" character is printed as

        non-curly quote: <repr of the buffered text>

    Only text passed to handle_data() is examined; the other callbacks
    do nothing except flush the pending text.
    """

    def __init__(self, verbose=0):
        # Set up the buffer first, then let the base class initialise.
        self.testdata = ""
        SGMLParser.__init__(self, verbose)

    def handle_data(self, data):
        """Append data to the buffer; flush once its repr() is >= 70 chars."""
        self.testdata = self.testdata + data
        if len(repr(self.testdata)) >= 70:
            self.flush()

    def flush(self):
        """Empty the buffer, printing it if it contains a straight quote."""
        data = self.testdata
        if data:
            self.testdata = ""

            # 2001.10.12 -RjC
            #
            # The "in" operator is used instead of string.find(): the
            # name "string" is only in scope if "from sgmllib import *"
            # happens to export it.
            if '"' in data or "'" in data:
                print('non-curly quote: ' + repr(data))

    def handle_comment(self, data):
        """A comment ends the current run of text; its content is ignored."""
        self.flush()

    def unknown_starttag(self, tag, attrs):
        """A start tag ends the current run of text; tag and attrs are ignored."""
        self.flush()

    def unknown_endtag(self, tag):
        """An end tag ends the current run of text."""
        self.flush()

    def unknown_entityref(self, ref):
        """An unknown entity reference ends the current run of text."""
        self.flush()

    def unknown_charref(self, ref):
        """An unknown character reference ends the current run of text."""
        self.flush()

    def close(self):
        """Close the underlying parser, then report any remaining text."""
        SGMLParser.close(self)
        self.flush()
# =======================================================


# Run only when executed as a script, so that importing this file does
# not parse sys.argv or call sys.exit().
if __name__ == '__main__':
    test()

###