1    #! /usr/bin/env python
       2    # Time-stamp: <2001-10-17 22:05:33 cymbala>
       3    
       4    
       5    # SYNTAX:  python noncurlq.py FILENAME
       6    # ====================================
       7    
       8    # Here's an example:
       9    #
      10    # cymbala@debian:~$ python /floppy/noncurlq.py 04.htm
      11    # non-curly quote: 'industrialists"[71]:  \012    '
      12    # cymbala@debian:~$
      13    # =======================================================
      14    
      15    
      16    # Why?
      17    
      18    # FIND straight (non-curly) quotes in HTML data (such as
      19    # words quoted by the author)...
      20    #
      21    # ... while IGNORING straight quotes that are part of
      22    # mark-up (such as HREF values that are quoted inside an A
      23    # anchor).
      24    
      25    
      26    # Once located, you may manually change '"' to either
      27    # '&#8220;' or '&#8221;' depending on whether the straight
# quote is an opener or a closer.  Likewise for "'" (and its
# corresponding numeric values, '&#8216;' for the left single
# quote and '&#8217;' for the right single quote).
      31    
      32    
      33    # Ideally, one would use "&ldquo;"   "&lsquo;"
#                        "&rdquo;"   "&rsquo;"
      35    # ...instead of numeric values.
      36    # Symbols are better (more portable) than numbers.
      37    
      38    
      39    # Besides finding straight quotes, this script is an example
      40    # of how to use Python's sgmllib (SGML library) to look for
      41    # data inside an HTML file.  It's as simple as possible.  In
      42    # fact, most of this script was COPIED from sgmllib.py (located in
      43    # /usr/lib/python1.5).  All I had to do was make a few changes based
      44    # on a general understanding of how it works.
      45    
      46    
      47    # NOTE OF WARNING: Due to ``file = args[0]'' this script can
      48    # only process one file at a time.  This is dangerous,
      49    # because some people might ignore SYNTAX found above and
      50    # specify multiple file names for input... they would think
      51    # all files are being processed, whereas only the first
      52    # would be processed!!!!!  And that would be bad (or "evil"
      53    # to use a popular Bushism).
      54    
      55    
      56    
      57    
      58    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      59    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      60    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
      61    
      62    
      63    from sgmllib import *
      64    import sys
      65    
      66    
      67    # Copied from: /usr/lib/python1.5/sgmllib.py
      68    # (Added "pass" statements.)
      69    # -------------------------------------------------------
      70    def test(args = None):
      71        import sys
      72    
      73        if not args:
      74            args = sys.argv[1:]
      75            pass
      76    
      77        if args and args[0] == '-s':
      78            args = args[1:]
      79            klass = SGMLParser
      80            pass
      81        else:
      82            klass = TestSGMLParser
      83            pass
      84    
      85        if args:
      86            file = args[0]
      87            pass
      88        else:
      89            file = 'test.html'
      90            pass
      91    
      92        if file == '-':
      93            f = sys.stdin
      94            pass
      95        else:
      96            try:
      97                f = open(file, 'r')
      98                pass
      99            except IOError, msg:
     100                print file, ":", msg
     101                sys.exit(1)
     102                pass
     103            pass
     104    
     105        data = f.read()
     106        if f is not sys.stdin:
     107            f.close()
     108            pass
     109    
     110        x = klass()
     111        for c in data:
     112            x.feed(c)
     113            pass
     114        x.close()
     115        pass
     116    # -------------------------------------------------------
     117    
     118    
     119    
     120    
     121    # Copied from: /usr/lib/python1.5/sgmllib.py
     122    #   (Added "pass" statements.)
     123    #   (Commented most, if not all, print statements.)
     124    #
#   (Added condition to look for '"' or "'" in data.)    <--- !
     126    #
     127    # =======================================================
     128    class TestSGMLParser(SGMLParser):
     129    
     130        def __init__(self, verbose=0):
     131            self.testdata = ""
     132            SGMLParser.__init__(self, verbose)
     133            pass
     134    
     135        def handle_data(self, data):
     136            self.testdata = self.testdata + data
     137            if len(`self.testdata`) >= 70:
     138                self.flush()
     139                pass
     140            pass
     141    
     142        def flush(self):
     143            data = self.testdata
     144            if data:
     145                self.testdata = ""
     146                # print 'data:', `data`
     147    
     148                # 2001.10.12 -RjC
     149                #
     150                if ((string.find(data, '"') > -1) or (string.find(data, "'") > -1)):
     151                    print 'non-curly quote:', `data`
     152                    pass
     153                
     154                pass
     155            pass
     156    
     157        def handle_comment(self, data):
     158            self.flush()
     159            r = `data`
     160            if len(r) > 68:
     161                r = r[:32] + '...' + r[-32:]
     162                pass
     163            # print 'comment:', r
     164            pass
     165    
     166        def unknown_starttag(self, tag, attrs):
     167            self.flush()
     168            if not attrs:
     169                # print 'start tag: <' + tag + '>'
     170                pass
     171            else:
     172                # print 'start tag: <' + tag,
     173                for name, value in attrs:
     174                    # print name + '=' + '"' + value + '"',
     175                    pass
     176                # print '>'
     177                pass
     178            pass
     179    
     180        def unknown_endtag(self, tag):
     181            self.flush()
     182            # print 'end tag: </' + tag + '>'
     183            pass
     184    
     185        def unknown_entityref(self, ref):
     186            self.flush()
     187            # print '*** unknown entity ref: &' + ref + ';'
     188            pass
     189    
     190        def unknown_charref(self, ref):
     191            self.flush()
     192            # print '*** unknown char ref: &#' + ref + ';'
     193            pass
     194    
     195        def close(self):
     196            SGMLParser.close(self)
     197            self.flush()
     198            pass
     199    # =======================================================
     200    
     201    
     202    
     203    test()
     204    
     205    ###
     206    #