1    #!/usr/bin/env python
       2    # Time-stamp: <2003-05-15 05:16:10 cymbala>
       3    #
       4    # For more information about this script, see:
       5    #   <URL: http://www.lafn.org/~cymbala/Lia/lia_qual.html>
       6    
       7    # SYNTAX: python lia_1st_.py [-n y | --finish_para=y] [--begin_para=x] 
       8    # ------------------------------------------------------------------
       9    
      10    
      11    info = {}
      12    info['title'] = 'LENIN COLLECTED WORKS'
      13    info['py_name'] = 'lia_1st_'
      14    info['debug'] = 0
      15    info['debug_echo_htm'] = 0
      16    info['v'] = '01'
      17    info['v'] = '02'
      18    info['v'] = '03'
      19    info['v'] = '14'
      20    info['v'] = '22'
      21    info['v'] = '23'
      22    info['v'] = '24'
      23    info['v'] = '25'
      24    info['v'] = '26'
      25    info['v'] = '27'
      26    info['v'] = '28'
      27    info['v'] = '29'
      28    info['v'] = '30'
      29    info['begin_para'] = 1
      30    info['finish_para'] = 10000
      31    
      32    
      33    from sgmllib import SGMLParser
      34    import fileinput, formatter
      35    import getopt
      36    import os
      37    import popen2
      38    import re
      39    import string, sys
      40    import tempfile, time, types
      41    
      42    
      43    class Share:
      44        """___"""
      45    
      46        def __init__compile_regexps__(self):
      47            self.re = {}
      48            self.re['comment'] = re.compile('^[\s]*#')
      49            self.re['letter'] = re.compile('[a-z]', re.I)
      50            self.re['special_emdash'] = re.compile("—")
      51            self.re['special_rsquo'] = re.compile("’") # Try C-r ' on that one!
      52            self.re['special_ldquo'] = re.compile("“")
      53            self.re['special_rdquo'] = re.compile("”")
      54            self.re['white_spaces_leading'] = re.compile('^\s+')
      55            self.re['newline'] = re.compile('\n')
      56            self.re['nonbreaking_spaces_ref'] = re.compile('(&#160;?)')
      57            self.re['comment_volpage'] = re.compile(
      58                'v[^0-9]*([0-9]+)[^p]*p[^0-9]*([0-9]+)', re.I)
      59            #
      60            self.re['numbers'] = re.compile('^([0-9]+)$')
      61            self.re['ast_two_numbers'] = re.compile('[^0-9]*([0-9]+)[^0-9]+([0-9]+)')
      62    
      63            pass
      64    
      65        def __init__(self, info):
      66            self.__init__compile_regexps__()
      67    
      68            self.info = info
      69    
      70            optlist, args = getopt.getopt(sys.argv[1:], 'd:n:v:',
      71                                          ['finish_para=', 'volume='
      72                                           'begin_para=', 'debug='])
      73            # If -n specified set begin_para to 1 unless --begin_para also specified.
      74            spam = None
      75            for i in optlist:
      76                if i[0] == '-d' or i[0] == '--debug':
      77                    self.info['debug'] = int(i[1]) # int() very important!
      78                    pass
      79                elif i[0] == '-v' or i[0] == '--volume':
      80                    self.info['v'] = i[1]
      81                    pass
      82                elif i[0] == '-n' or i[0] == '--finish_para':
      83                    if not spam: self.info['begin_para'] = 1
      84                    self.info['finish_para'] = i[1]
      85                    pass
      86                elif i[0] == '--begin_para':
      87                    self.info['begin_para'] = i[1]
      88                    spam = 't'
      89                    pass
      90                print i
      91                print "zzzz" + str(self.info['debug']) + "yyy" + str(i[1])
      92                pass
      93            self.info['begin_para'] = int(self.info['begin_para'])
      94            self.info['finish_para'] = int(self.info['finish_para'])
      95    
      96            self.info['input_actual'] = '~/www.marxists.org/archive/lenin/works/cw/volume'+ self.info['v'] +'.htm'
      97            self.info['input_expected'] = '~/www.marxists.org/archive/lenin/works/cw/lia_1st_'+ self.info['v'] +'.txt'
      98            
      99            
     100            self.debug = self.info['debug']
     101            print str(self.debug) + "!"
     102            self.debug_echo_htm = self.info['debug_echo_htm']
     103            
     104            self.tempfile = tempfile.mktemp()
     105            self.report_filename = self.create_rptname(0)
     106            self.reportobj_w = open(self.report_filename, 'w')
     107            # if share.debug: self.prt('\ntemporary file: ' + self.tempfile)
     108            self.tempfiles = []
     109    
     110            self.begin_para = self.info['begin_para']
     111            self.finish_para = self.info['finish_para']
     112            
     113            self.charref_four = {}
     114            self.charref_four['147'] = '"'
     115            self.charref_four['148'] = '"'
     116            self.charref_four['8212'] = "--"
     117            self.charref_four['8217'] = "'"
     118            self.charref_four['8220'] = '"'
     119            self.charref_four['8221'] = '"'
     120    
     121            self.first_lines_max_chars = 80
     122    
     123            pass
     124    
     125        def create_rptname(self, n):
     126            filename = self.tempfile + '_' + self.info['py_name']
     127            filename = filename + ('%02d' % n) + '.rpt'
     128            return filename
     129            pass
     130    
     131        def os_path_join_norm(self, base, anchor):
     132            """___"""
     133            #
     134            return os.path.expanduser(os.path.normpath(os.path.join(base, anchor)))
     135    
     136    
     137        def expanduser(self, name):
     138            """___"""
     139            # Return with user expanded.
     140            name = os.path.expanduser(name)
     141            return name
     142            pass
     143    
     144    
     145        def filename(self, filename):
     146            # Return a filename.
     147            filename = self.expanduser(filename)
     148            if not os.path.isfile(filename):
     149                raise 'Not an existing regular file: ' + filename
     150                pass
     151            if share.debug: share.prt('\n*** filename ***  ' + filename)
     152            return filename
     153            pass
     154    
     155    
     156        def fileobj_w(self, filename):
     157            # Return a file object for writing.
     158            self.tempfiles.append(filename)
     159            return open(filename, 'w')
     160    
     161    
     162        def row_id_return(self, v, t, l):
     163            return string.join(['%03d' % v,
     164                                '%03d' % t,
     165                                '%04d' % l,
     166                                ], '.')
     167            pass
     168    
     169    
     170        def diff(self):
     171            filearg1 = expected.fileobj_name
     172            filearg2 = actual.fileobj_name
     173            cmd = 'diff   ' + filearg1
     174            cmd = cmd + ' ' + filearg2
     175    
     176            child_stdout, child_stdin = popen2.popen2(cmd)
     177            share.prt('\nRESULTS:')
     178            share.prt('< ' + filearg1 + '')
     179            share.prt('> ' + filearg2 + '\n')
     180            share.prt(child_stdout.read())
     181            
     182            child_stdin.close()
     183            child_stdout.close()
     184            pass
     185    
     186    
     187        def prt(self, message):
     188            print message
     189            self.reportobj_w.write(message + '\n')
     190            pass
     191    
     192    
     193        def bye_bye(self):
     194            # Remove temporary files.
     195            if not share.debug:
     196                for filename in self.tempfiles:
     197                    os.remove(filename)
     198                    pass
     199                pass
     200            
     201            share.prt('\nDone.')
     202    
     203            # To find files that aren't link, print names of files parsed.
     204            share.prt('')
     205            for i in actual.seen_these_before:
     206                # Delete leading "/home/cymbala/www.marxists.org/"
     207                match_obj = re.compile('.*\.org/', re.I).match(i)
     208                if match_obj: share.prt(i[match_obj.end(0):])
     209                else: share.prt(i)
     210                pass
     211            share.prt('')
     212    
     213            self.reportobj_w.close()
     214    
     215            filename = self.create_rptname(self.volume)
     216            os.rename(self.report_filename, filename)
     217            pass
     218    
     219    
     220    class Expected:
     221        """___"""
     222    
     223        def __init__compile_regexps__(self):
     224            self.re = {}
     225            self.re['comment_first_lines'] = re.compile('^([^ 	]+[ 	]*)?#')
     226            
     227            # 000: 001-045
     228            self.re['volume_range'] = re.compile(
     229                '^000: ([0-9]+)-([0-9]+)$')
     230            # 001.000: 001-004
     231            self.re['text_range'] = re.compile(
     232                '^([0-9]{3})\.000: ([0-9]+)-([0-9]+)$')
     233    
     234            self.triad = '^([0-9]{3})\.([0-9]{3})\.([0-9]{4})'
     235            
     236            # 001.001.0000:  (+ 0 123) 123
     237            self.re['line_range'] = re.compile(
     238                self.triad + ':?[ 	]+\(\+ ([0-9]+) ([0-9]+)\) ([0-9]+)$')
     239            # 001.001.0000  () Title
     240            self.re['title'] = re.compile(
     241                self.triad + '[ ][ ]\(\)[ 	]*(.*)$')
     242            # 001.001.0000 not_space_or_hash
     243            self.re['first_line'] = re.compile(
     244                self.triad + '[ ]([^ ].*)$')
     245            
     246            #self.re[''] = re.compile()
     247            pass
     248    
     249        def __init__(self):
     250            self.__init__compile_regexps__()
     251    
     252            self.instance_of = 'expected'
     253    
     254            self.line_lengths = {}
     255    
     256            pass
     257    
     258        def __call__(self):
     259            share.para_cumulative = 0
     260            
     261            return self.read_input()
     262            pass
     263    
     264    
     265        def para_counter_reset(self):
     266            self.line = 0
     267            pass
     268        def text_counter_reset(self):
     269            self.text = 0
     270            pass
     271        def para_counter_increment(self):
     272            if self.increment_para_counter_flag:
     273                self.line = self.line + 1
     274                share.para_cumulative = share.para_cumulative + 1
     275                pass
     276            pass
     277        def text_counter_increment(self):
     278            self.test_end_of_text()
     279            #
     280            if self.increment_text_counter_flag:
     281                self.text = self.text + 1
     282                if share.debug: share.prt('TEXT_COUNTER_INCREMENT: ' + str(self.text))
     283                pass
     284            #
     285            self.increment_text_counter_flag = 0
     286            pass
     287    
     288        def test_end_of_text(self):
     289    
     290            # Between two texts, end of previous text is marked by:
     291            # 003.002.0000  () Uncritical Criticism
     292            # 003.002.0000:  (+ 0 27) 27
     293            #
     294            # or:
     295            # 003.002.0000:  (+ 0 27) 27
     296            # 003.002.0000  () Uncritical Criticism
     297            #
     298            # ...or not, if file is sorted...
     299            #
     300            # ...so, this test happens during new title AND during
     301            # new line range, WHICHEVER COMES FIRST.
     302    
     303            if self.test_end_of_text_flag:
     304                if not self.line == self.line_end:
     305                    a = str(self.line)
     306                    b = str(self.line_end)
     307                    z = 'Text ' + str(self.text) + ': '
     308                    raise z + 'Found ' + a + ' paragraphs, expected ' + b
     309                pass
     310            self.test_end_of_text_flag = 0
     311            pass
     312    
     313        def read_input(self):
     314            # Read expected data.
     315            filename = share.filename(
     316                share.info['input_' + string.lower(self.instance_of)])
     317    
     318            self.fileobj_name = share.tempfile + '_' + self.instance_of + '_data'
     319            self.fileobj_w = share.fileobj_w(self.fileobj_name)
     320            self.found_volume_range = 0
     321            self.found_text_range = 0
     322            self.volume_min = self.text_min = 2
     323            self.volume_max = self.text_max = 0
     324    
     325            self.test_end_of_text_flag = 0
     326            self.increment_text_counter_flag = 1
     327            self.increment_para_counter_flag = 1
     328            self.text_counter_reset()
     329            
     330            for line in fileinput.input(filename):
     331    
     332                if self.re['comment_first_lines'].match(line):
     333                    continue
     334    
     335                self.fall_through = 0
     336    
     337                # 000: 001-045
     338                match_obj = self.re['volume_range'].match(line)
     339                if match_obj and not self.fall_through:
     340                    self.fall_through = 1
     341    
     342                    self.found_volume_range = 1
     343                    self.volume_min = int(match_obj.group(1))
     344                    self.volume_max = int(match_obj.group(2))
     345                    
     346                    if share.debug:
     347                        share.prt('Volume Start : ' + str(self.volume_min))
     348                        pass
     349                    if share.debug:
     350                        share.prt('Volume End   : ' + str(self.volume_max))
     351                        pass
     352    
     353                    pass
     354    
     355                # 001.000: 001-004
     356                match_obj = self.re['text_range'].match(line)
     357                if match_obj and not self.fall_through:
     358                    self.fall_through = 1
     359    
     360                    self.found_text_range = 1
     361                    self.volume = int(match_obj.group(1))
     362                    self.text_min = int(match_obj.group(2))
     363                    self.text_max = int(match_obj.group(3))
     364    
     365                    share.prt('\nVOLUME: ' + str(self.volume))
     366    
     367                    if share.debug:
     368                        share.prt('VOLUME ------> ' + str(self.volume))
     369                        pass
     370                    if share.debug:
     371                        share.prt('Text Start   : ' + str(self.text_min))
     372                        pass
     373                    if share.debug:
     374                        share.prt('Text End     : ' + str(self.text_max))
     375                        pass
     376    
     377                    pass
     378    
     379                # 001.001.0000: (+ 0 123) 123
     380                match_obj = self.re['line_range'].match(line)
     381                if match_obj and not self.fall_through:
     382                    self.fall_through = 1
     383    
     384                    if 0 == int(match_obj.group(3)):
     385    
     386                        self.text_counter_increment()
     387                        
     388                        self.line_start = int(match_obj.group(5))
     389                        self.line_end = int(match_obj.group(6))
     390    
     391                        if not self.line_start == self.line_end:
     392                            raise 'Invalid format: ' + line
     393                        else:
     394                            self.line_start = 1
     395                            pass
     396    
     397                        if share.debug:
     398                            share.prt('TEXT   ------> ' + str(self.text))
     399                            pass
     400                        if share.debug:
     401                            share.prt('Line Start   : ' + str(self.line_start))
     402                            pass
     403                        if share.debug:
     404                            share.prt('Line End     : ' + str(self.line_end))
     405                            pass
     406    
     407                        expected.para_counter_reset()
     408                        
     409                        pass
     410    
     411                    if not self.volume == int(match_obj.group(1)):
     412                        raise str(self.volume) + ': Unexpected volume number: ' + line
     413                    elif not self.text == int(match_obj.group(2)):
     414                        raise str(self.text) + ': Unexpected text number: ' + line
     415                    
     416                    pass
     417    
     418                # 001.001.0001  () The Development of
     419                match_obj = self.re['title'].match(line)
     420                if match_obj and not self.fall_through:
     421                    self.fall_through = 1
     422    
     423                    if 0 == int(match_obj.group(3)):
     424                        self.text_counter_increment()
     425                        expected.para_counter_reset()
     426                        self.title = match_obj.group(4)
     427                        share.prt('\nTitle (' + str(self.text) + '): ' + self.title)
     428                        pass
     429                    else:
     430                        self.subtitle = match_obj.group(4)
     431                        if share.debug:
     432                            if self.subtitle:
     433                                share.prt('     subtitle: ' + self.subtitle)
     434                                pass
     435                            pass
     436                        pass
     437    
     438                    pass
     439    
     440                # 001.001.0001 Cover of the first edition of
     441                match_obj = self.re['first_line'].match(line)
     442                if match_obj and not self.fall_through:
     443                    self.fall_through = 1
     444    
     445                    self.para_counter_increment()
     446    
     447                    if self.line == 1:
     448                        self.increment_text_counter_flag = 1
     449                        self.test_end_of_text_flag = 1
     450                        pass
     451                    
     452                    if not self.volume == int(match_obj.group(1)):
     453                        raise str(self.volume) + ': Unexpected volume number: ' + line
     454                    elif not self.text == int(match_obj.group(2)):
     455                        raise str(self.text) + ': Unexpected text number: ' + line
     456                    elif not self.line == int(match_obj.group(3)):
     457                        raise str(self.line) + ': Unexpected line number: ' + line
     458    
     459                    if (share.para_cumulative >= share.begin_para) and \
                       (share.para_cumulative <= share.finish_para):
     461                        row_id = share.row_id_return(expected.volume,
     462                                                     expected.text,
     463                                                     expected.line)
     464                        self.line_lengths[row_id] = len(match_obj.group(4))
     465    
     466                        self.fileobj_w.write(line)
     467                        pass
     468                    pass
     469    
     470                if not self.fall_through:
     471                    raise 'Line not trapped: ' + line
     472                    pass
     473    
     474                pass
     475    
     476            self.test_end_of_text()
     477    
     478            if not self.found_volume_range:
     479                raise 'Did not find volume range in: ' + filename
     480            elif not self.found_text_range:
     481                raise 'Did not find text range in: ' + filename
     482            elif not self.text == self.text_max:
     483                a = str(self.text)
     484                b = str(self.text_max)
     485                raise 'Found ' + a + ' texts, expected ' + b
     486            
     487            self.fileobj_w.close()
     488            return self.text_max
     489        pass
     490    
     491    
     492    
     493    class Actual:
     494        """___"""
     495    
     496        def __init__compile_regexps__(self):
     497            self.re = {}
     498            self.re['dot_htm'] = re.compile('[.]html?$', re.I)
     499            self.re['volume_htm'] = re.compile('/volume([^.]+)[.]html?$', re.I)
     500            self.re['index_htm'] = re.compile('/index[.]html?$', re.I)
     501    
     502            pass
     503    
     504        def __init__(self):
     505            self.__init__compile_regexps__()
     506    
     507            self.instance_of = 'actual'
     508    
     509            self.filename_stack = []
     510            self.seen_these_before = []
     511    
     512            share.info['ignore_tags_first_line'] = ['a',
     513                                                    'em',
     514                                                    'span',
     515                                                    'sup']
     516            self.switch_by_level = None
     517    
     518            share.info['ignore_with_attrs_of_'] = {}
     519            # Not: quote, sig.
     520            # Not sure: indexa.
     521            share.info['ignore_with_attrs_of_']['p'] = (('class', 'footer'),
     522                                                        ('class', 'indent'),
     523                                                        ('class', 'index'),
     524                                                        ('class', 'indexa'),
     525                                                        ('class', 'indexb'),
     526                                                        ('class', 'indexc'),
     527                                                        ('class', 'index-list'),
     528                                                        ('class', 'information'),
     529                                                        ('class', 'head'),
     530                                                        ('class', 'next'),
     531                                                        ('class', 'pagenote'),
     532                                                        ('class', 'pagenoteb'),
     533                                                        ('class', 'sub'),
     534                                                        ('class', 'toc'))
     535            share.info['ignore_with_attrs_of_']['blockquote'] = (('class', 'abcdefg'),
     536                                                                 ('class', 'lmnopqr'))
     537            # ('class', 'quote'),
     538    
     539            pass
     540    
     541    
     542        def custom_io(self, action, filename):
     543            """___"""
     544            #
     545            if action == 'open':
     546                # Allows opening of a 2nd file before end of 1st file.
     547                #
     548                # Change to urllib.urlopen ...
     549                
     550                return_list = []
     551                for line in fileinput.input(filename):
     552                    return_list.append(line)
     553                    pass
     554                fileinput.close()
     555                self.filename_stack.append(filename)
     556                return return_list
     557            elif action == 'close':
     558                del self.filename_stack[-1]
     559                return None
     560            else: raise 'Unknown action: ' + action
     561    
     562    
     563        def actual_fileinput(self, filename):
     564            """___"""
     565    
     566            if share.debug:
     567                share.prt('*** ACTUAL_FILEINPUT *** Actual :: ' + filename)
     568                pass
     569            if actual.text > share.text_max:
     570                if share.debug: share.prt('TEXT_NOT_IN_EXPECTED: ' + filename)
     571                return
     572    
     573            if actual.level == 1:
     574                actual.switch_by_level = 't'
     575                actual.text_counter_increment()
     576                actual.para_counter_reset()
     577                pass
     578                    
     579            # SEE:  "test" in /usr/lib/python1.5/htmllib.py for example.
     580            #      formatter.NullFormatter()
     581            #      formatter.AbstractFormatter(formatter.DumbWriter())
     582    
     583            if share.debug: share.prt('\nCREATING PARSER...')
     584            parser = ACTUALParser(formatter.NullFormatter())
     585            parser.chunk_ignore_switch = None
     586            # -- parser.tag_ignore_switch = None
     587    
     588            parser.filename_of_source_data = filename
     589            parser.basename_of_source_data = os.path.basename(filename)
     590    
     591            # Required for actual since order does not matter.
     592            self.row_id_previous = None
     593    
     594            lines_list = actual.custom_io('open', filename)
     595    
     596            # For reporting unbalanced tags and out-of-sequence volpage.
     597            self.source_filename = filename
     598            
     599            for i in range(len(lines_list)):
     600    
     601                # For reporting unbalanced tags.
     602                self.unbalanced_i = i
     603    
     604                line = lines_list[i]
     605                if share.debug_echo_htm: share.prt(
     606                    parser.basename_of_source_data + ' >' + line[:-1])
     607    
     608                # Non-breaking space:
     609                if share.re['nonbreaking_spaces_ref'].search(line):
     610                    line = share.re['nonbreaking_spaces_ref'].sub(' ', line)
     611                    pass
     612                
     613                parser.line = line
     614                parser.feed(line)
     615                pass
     616            parser.close()
     617            actual.custom_io('close', None)
     618            return 0
     619    
     620    
     621        def __call__(self):
     622            share.para_cumulative = 0
     623            
     624            filename = share.filename(
     625                share.info['input_' + string.lower(self.instance_of)])
     626    
     627            self.level = 0
     628    
     629            match_obj = actual.re['volume_htm'].search(filename)
     630            if match_obj:
     631    
     632                self.volume = share.volume = int(match_obj.group(1))
     633                if not actual.volume == expected.volume:
     634                    raise 'Volumes different: expected=' + str(
     635                        expected.volume) + ' actual=' + str(
     636                        actual.volume)
     637                    pass
     638                pass
     639            else:
     640                raise filename + ' failed: ' + actual.re['volume_htm'].pattern
     641    
     642            self.fileobj_name = share.tempfile + '_' + self.instance_of + '_data'
     643            self.fileobj_w = share.fileobj_w(self.fileobj_name)
     644            self.read_input(filename)
     645            self.fileobj_w.close()
     646            
     647            pass
     648    
     649    
     650        def para_counter_reset(self):
     651            self.line = 0
     652            pass
     653        def text_counter_reset(self):
     654            self.text = 0
     655            pass
     656        def para_counter_increment(self):
     657            self.line = self.line + 1
     658            share.para_cumulative = share.para_cumulative + 1
     659            pass
     660        def text_counter_increment(self):
     661            self.text = self.text + 1
     662            if share.debug: share.prt('TEXT_COUNTER_INCREMENT: ' + str(self.text))
     663            pass
     664    
     665    
     666        def read_input(self, filename):
     667            self.text_counter_reset()
     668            self.actual_fileinput(filename)
     669            pass
     670        
     671        
     672        def anchors_to_exclude(self, url):
     673            """___"""
     674    
     675            paths = string.split(url, '/')
     676            # [ ... 'archive', 'lenin', 'works', 'cw', 'volume01.htm']
     677            switch = None
     678    
     679            # When reading volume99.htm file, remember dirname in
     680            # order to exclude files outside dirname's branch.
     681            level0_dirname = os.path.dirname(url)
     682            if share.debug:
     683                share.prt('\nINPUT: ' + url)
     684                share.prt('restricting to dir: ' + level0_dirname)
     685                pass
     686            if not actual.level0_dirname == url[:len(actual.level0_dirname)]:
     687                switch = 't'
     688                pass
     689    
     690            if url in actual.seen_these_before: switch = 't'
     691            else:
     692                for i in range(len(paths)):
     693                    # == /archive/
     694                    if paths[i] == 'archive':
     695                        if re.compile('^index[.]html?$').match(paths[-1]):
     696                            # == COLLECTED WORKS index
     697                            if paths[-2] == 'cw': switch = 't'
     698                            pass
     699                        # == Not enough pieces ("archive/lenin/works/index.htm")
     700                        if ((i + 4) > (len(paths) - 1)): switch = 't'
     701                        #
     702                        # == 'x' file (see:
     703                        # ==    marxists.org/admin/workshop/info/file-structure.txt
     704                        # elif paths[-1][:1] == 'x': switch = 't'
     705                        #
     706                        # == /archive/(not lenin)/
     707                        elif paths[i + 1] != 'lenin': switch = 't'
     708                        pass
     709                    # == /glossary/
     710                    elif paths[i] == 'admin': switch = 't'
     711                    elif paths[i] == 'glossary': switch = 't'
     712                    elif paths[i] == 'subject': switch = 't'
     713                    pass
     714                pass
     715            return switch
     716    
     717    
     718        def seen_it_before(self, candidate):
     719            """___"""
     720            #
     721            if not candidate in actual.seen_these_before:
     722                actual.seen_these_before.append(candidate)
     723                return None
     724            else:
     725                return candidate
     726            pass
     727        pass
     728    
     729    
     730    class ACTUALSGMLParser(SGMLParser):
     731        """Same as SGMLParser but report _where_ unbalanced tag exists"""
     732        
     733        # Example -- report an unbalanced </...> tag.
     734        def report_unbalanced(self, tag):
     735            if self.verbose:
     736                msg = '--- Line=' + str(actual.unbalanced_i)
     737                msg = msg + ' File=' + actual.source_filename
     738                share.prt(msg)
     739                share.prt('*-* Unbalanced </' + tag + '>')
     740                share.prt('*-* Stack: ' + `self.stack`)
     741                pass
     742            pass
     743        pass
     744    
     745    
     746    
     747    class ACTUALParser(ACTUALSGMLParser):
     748        """___"""
     749        #
     750        # Using SGMLParser instead of HTMLParser by incorporating
     751        # just a few pieces from HTMLParser here.
     752        
     753        #
     754        # Copied from TestSGMLParser in /usr/lib/python1.5/sgmllib.py.
     755        # ------------------------------------------------------------
     756        def __init__(self, verbose=0):
     757            self.actualdata = ''
     758            self.anchorlist = []
     759            self.volpage = {'v': 0, 'p': 0}
     760            self.base = None
     761            SGMLParser.__init__(self, verbose)
     762            self.first_line = ''
     763            self.first_line_switch = None
     764            pass
     765    
     766        def handle_data(self, data):
     767            #
     768            # -- if not self.tag_ignore_switch:
     769            if len(self.stack) > 0:
     770                # This "not 'span'" seems wrong ... 2002.04.03 ...
     771                if not 'span' in self.stack:
     772                    data = share.re['white_spaces_leading'].sub(' ', data)
     773                    data = share.re['special_emdash'].sub("--", data)
     774                    data = share.re['special_rsquo'].sub("'", data)
     775                    data = share.re['special_ldquo'].sub('"', data)
     776                    data = share.re['special_rdquo'].sub('"', data)
     777                    self.actualdata = self.actualdata + data
     778                    pass
     779                pass
     780            if len(`self.actualdata`) >= 70:
     781                self.flush()
     782    
     783        def flush(self):
     784            data = self.actualdata
     785            if data:
     786                #
     787                if self.first_line_switch:
     788                    if len(self.first_line) < share.first_lines_max_chars:
     789                        self.first_line = self.first_line + data
     790                    pass
     791                #
     792                # Defaults:
     793                self.actualdata = ''
     794                # print 'data:', `data`
     795                pass
     796            pass
     797    
     798        def volpage_check(self, mygroup):
     799            if self.volpage['v'] == 0:
     800                self.volpage['v'] = int(mygroup(1))
     801                self.volpage['p'] = int(mygroup(2))
     802                pass
     803            else:
     804                if self.volpage['v'] <> int(mygroup(1)):
     805                    share.prt('\nINVALID VOLUME in '
     806                              + actual.source_filename + ': '
     807                              + mygroup(0))
     808                    pass
     809                if (self.volpage['p'] + 1) <> int(mygroup(2)):
     810                    share.prt('\nINVALID PAGE in '
     811                              + actual.source_filename + ': '
     812                              + mygroup(0))
     813                    pass
     814                else:
     815                    self.volpage['p'] = int(mygroup(2))
     816                    pass
     817                pass
     818            pass
     819        def handle_comment(self, data):
     820            self.flush()
     821            if len(data) < 100:
     822                # EXAMPLE:
     823                # <!-- vol 07 page 34 -->
     824                match_obj = share.re['comment_volpage'].search(data)
     825                if match_obj:
     826                    self.volpage_check(match_obj.group)
     827                pass
     828            pass
     829    
     830        def unknown_starttag(self, tag, attrs):
     831            self.flush()
     832            pass
     833        def unknown_endtag(self, tag):
     834            self.flush()
     835            pass
     836        def unknown_entityref(self, ref):
     837            self.flush()
     838            pass
     839        def unknown_charref(self, ref):
     840            if ref in share.charref_four.keys(): char = share.charref_four[ref]
     841            else: char = '~'
     842            self.handle_data(char)
     843            pass
     844            
     845        def close(self):
     846            SGMLParser.close(self)
     847            pass
     848    
     849        # Overridable -- handle start tag
     850        def handle_starttag(self, tag, method, attrs):
     851            #
     852            if not tag in share.info['ignore_tags_first_line']:
     853                self.handle_first_line()
     854                self.first_line_switch = None
     855                pass
     856            #
     857            # Default definition:
     858            method(attrs)
     859            pass
     860        
     861        # Overridable -- handle end tag
     862        def handle_endtag(self, tag, method):
     863            #
     864            # Ignore everything starting with footnotes.
     865            if ((self.actualdata == 'Footnotes' or
     866                 self.actualdata == "Author's Footnotes" or
     867                 self.actualdata == 'Endnotes' or
     868                 self.actualdata == "Editor's Endnotes")
     869                and
     870                (tag == 'h2' or tag == 'h3' or tag == 'h4')):
     871                self.chunk_ignore_switch = 't'
     872                pass
     873            if not tag in share.info['ignore_tags_first_line']:
     874                self.handle_first_line()
     875                self.first_line_switch = None
     876                pass
     877            #
     878            # Default definition:
     879            method()
     880            pass
     881    
     882        
     883        # Do (ones that don't require a separate end tag).
     884        def do_hr(self, attrs):
     885            pass
     886        def do_img(self, attrs):
     887            if actual.switch_by_level:
     888                self.first_line_switch = 't'
     889                self.first_line = 'IMAGE '
     890    
     891                # Exclude heading (H2, H3,...) GIFs in "The State and Revolution"
     892                switch = 't'
     893                for tuple in attrs:
     894                    if string.upper(tuple[0]) == 'SRC':
     895                        if string.lower(tuple[1]) in ['pics/chpt1.gif',
     896                                                      'pics/chpt2.gif',
     897                                                      'pics/chpt3.gif',
     898                                                      'pics/chpt4.gif',
     899                                                      'pics/chpt5.gif',
     900                                                      'pics/chpt6.gif',
     901                                                      'pics/index.gif',
     902                                                      'pics/intro.gif',
     903                                                      'pics/manuscpt.gif',
     904                                                      'pics/name.gif',
     905                                                      'pics/postscpt.gif',
     906                                                      'pics/preface.gif']:
     907                            self.first_line_switch = None
     908                            self.first_line = ''
     909                            pass
     910                        pass
     911                    elif string.upper(tuple[0]) == 'ONMOUSEOUT':
     912                        if string.upper(tuple[1]) == 'T':
     913                            self.first_line_switch = None
     914                            self.first_line = ''
     915                            pass
     916                        pass
     917                    pass
     918                self.cull_first_line()
     919                pass
     920            pass
     921        def do_base(self, attrs):
     922            """___ htmllib.py"""
     923            for a, v in attrs:
     924                if a == 'href':
     925                    self.base = v
     926                    pass
     927                pass
     928            pass
     929    
     930        # Empty definitions to get tags into stack.
     931        def start_body(self, attrs):
     932            pass
     933        def start_center(self, attrs):
     934            pass
     935        def start_em(self, attrs):
     936            pass
     937        def start_head(self, attrs):
     938            pass
     939        def start_html(self, attrs):
     940            pass
     941        def start_h1(self, attrs):
     942            pass
     943        def end_h1(self):
     944            pass
     945        def start_h2(self, attrs):
     946            pass
     947        #
     948        def end_blockquote(self):
     949            pass
     950        def end_body(self):
     951            pass
     952        def end_h2(self):
     953            pass
     954        def start_h3(self, attrs):
     955            pass
     956        def end_h3(self):
     957            pass
     958        def start_h4(self, attrs):
     959            pass
     960        def end_h4(self):
     961            pass
     962        def start_h5(self, attrs):
     963            pass
     964        def end_h5(self):
     965            pass
     966        def start_h6(self, attrs):
     967            pass
     968        def end_h6(self):
     969            pass
     970        def end_p(self):
     971            pass
     972    
     973        # Blockquote's and p's will output first-lines.
     974        def start_output_wrapper(self, tag, attrs):
     975            switch = None
     976            if self.stack == ['html', 'body', tag]: switch = 't'
     977            elif self.stack == ['html', 'body', 'blockquote', tag]: switch = 't'
     978            #
     979            for attr in attrs:
     980                if attr in share.info['ignore_with_attrs_of_'][tag]: switch = None
     981                pass
     982            if (switch and
     983                actual.switch_by_level): self.first_line_switch = 't'
     984            pass
     985        def start_blockquote(self, attrs):
     986            self.start_output_wrapper('blockquote', attrs)
     987            pass
     988        def start_p(self, attrs):
     989            self.start_output_wrapper('p', attrs)
     990            pass
     991    
     992        def start_span(self, attrs):
     993            pass
     994        def start_style(self, attrs):
     995            pass
     996        def start_table(self, attrs):
     997            if not actual.re['index_htm'].search(self.filename_of_source_data):
     998                if actual.switch_by_level:
     999                    self.first_line_switch = 't'
    1000                    self.first_line = 'TABLE '
    1001                    self.cull_first_line()
    1002                    pass
    1003                pass
    1004            self.first_line_switch = None
    1005            pass
    1006        def start_title(self, attrs):
    1007            pass
    1008    
    1009        def start_a(self, attrs):
    1010            """___ from htmllib.py """
    1011            href = ''
    1012            name = ''
    1013            type = ''
    1014            onblur = ''
    1015            for attrname, value in attrs:
    1016                value = string.strip(value)
    1017                if attrname == 'href': href = value
    1018                elif attrname == 'name': name = value
    1019                elif attrname == 'type': type = string.lower(value)
    1020                elif attrname == 'onblur': onblur = 't'
    1021                pass
    1022            self.anchor_bgn(href, name, type, onblur)
    1023            pass
    1024        #
    1025        def end_a(self):
    1026            self.anchor_end()
    1027            pass
    1028        #
    1029        def anchor_bgn(self, href, name, type, onblur):
    1030            self.anchor = href
    1031            self.onblur = onblur
    1032            if self.anchor:
    1033                self.anchorlist.append(href)
    1034                pass
    1035            pass
    1036        #
    1037        def anchor_end(self):
    1038            if (self.anchor and (not self.onblur)):
    1039                # self.handle_data("[%d]" % len(self.anchorlist)) # RjC -2001.09.02
    1040                #
    1041                dict = {'tagname': 'a',
    1042                        'anchor': self.anchor,
    1043                        'data': self.actualdata
    1044                        }
    1045    
    1046                if self.base: dict['base'] = self.base
    1047                else: dict['base'] = os.path.dirname(actual.filename_stack[-1])
    1048    
    1049                filename = share.os_path_join_norm(dict['base'], dict['anchor'])
    1050                dict['absolute'] = filename
    1051                
    1052                self.handle_data_anchor(dict) # RjC -2001.09.02
    1053                #
    1054                self.anchor = None
    1055                pass
    1056            pass
    1057    
    1058    
    1059        def handle_data_anchor(self, dict):
    1060            """___"""
    1061            #
    1062            filename = dict['absolute']
    1063    
    1064            if filename in actual.seen_these_before:
    1065                if share.debug: share.prt('ALREADY SEEN: ' + filename)
    1066                return
    1067    
    1068            #
    1069            if 'data' in dict.keys():
    1070                # Remove newlines, redundant spaces, leading spaces, etc.
    1071                if dict['data']:
    1072                    dict['data'] = string.join(string.split(dict['data']))
    1073                    pass
    1074                pass
    1075            #
    1076            if actual.re['dot_htm'].search(filename):
    1077                #
    1078                # Exclude certain anchors, similar to wget's --no-parent but
    1079                # not quite because subdirectory `works/cw' is at the same level
    1080                # as children of the hierarchy such as `works/1906'
    1081                #
    1082    
    1083                if actual.level == 0:
    1084                    # When reading volume99.htm file, remember dirname in
    1085                    # order to exclude files outside dirname's branch.
    1086                    actual.level0_dirname = os.path.dirname(dict['absolute'])
    1087                    if share.debug:
    1088                        share.prt('\nINPUT: ' + dict['absolute'])
    1089                        share.prt('restricting to dir: ' + actual.level0_dirname)
    1090                        pass
    1091    
    1092                    # 2002.10.07: actual.text_counter_increment()
    1093                    actual.para_counter_reset()
    1094                    actual.switch_by_level = 't'
    1095                    pass
    1096    
    1097                if actual.anchors_to_exclude(filename):
    1098                    if share.debug: share.prt('ANCHORS_TO_EXCLUDE: ' + filename)
    1099                    return
    1100                if actual.seen_it_before(filename):
    1101                    if share.debug: share.prt('SEEN_IT_BEFORE: ' + filename)
    1102                    return
    1103    
    1104                if share.debug: share.prt('\nFOUND ANCHOR: ' + filename)
    1105                if share.debug: share.prt('\nDICT: ' + str(dict))
    1106                if share.debug: share.prt('-' * 77)
    1107                previous_switch_by_level = actual.switch_by_level
    1108                actual.level = actual.level + 1
    1109                actual.actual_fileinput(dict['absolute'])
    1110                actual.level = actual.level - 1
    1111                actual.switch_by_level = previous_switch_by_level
    1112                
    1113            pass
    1114    
    1115    
    1116        def cull(self, line):
    1117            if line:
    1118                actual.para_counter_increment()
    1119    
    1120                if (share.para_cumulative >= share.begin_para) and \
                   (share.para_cumulative <= share.finish_para):
    1122                    row_id = share.row_id_return(actual.volume,
    1123                                                   actual.text,
    1124                                                   actual.line)
    1125                    if row_id in expected.line_lengths.keys():
    1126                        line_out = string.join([row_id,
    1127                                                line[:expected.line_lengths[row_id]]])
    1128                        pass
    1129                    else:
    1130                        line_out = string.join([row_id,
    1131                                                '-MissingFromExpected-',
    1132                                                line[:share.first_lines_max_chars]])
    1133                        pass
    1134    
    1135                    if share.debug:
    1136                        share.prt('fileobj_w.write> ' + line_out)
    1137                        pass
    1138                    actual.fileobj_w.write(line_out + '\n')
    1139                    pass
    1140                pass
    1141            pass
    1142    
    1143    
    1144        def cull_first_line(self):
    1145            if (self.first_line_switch and not self.chunk_ignore_switch):
    1146                line = self.first_line
    1147                self.first_line = ''
    1148    
    1149                line = share.re['white_spaces_leading'].sub('', line)
    1150                line = string.join(string.split(line))
    1151                
    1152                self.cull(line)
    1153                pass
    1154            pass
    1155    
    1156        
    1157        def handle_first_line(self):
    1158            #
    1159            self.flush()
    1160            switch = 't'
    1161            #
    1162            # Skip: ``<p>&#160</p>''
    1163            
    1164            if self.chunk_ignore_switch: switch = None
    1165            elif not actual.switch_by_level: switch = None
    1166            #
    1167            if switch and self.first_line:
    1168                self.cull_first_line()
    1169                pass
    1170            pass
    1171    
    1172        pass #                                             Class 
    1173    
    1174    
    1175    
    1176    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    1177    share = Share(info)
    1178    
    1179    month_day_year = time.strftime("%B %d, %Y", time.localtime(time.time()))
    1180    
    1181    n = 70
    1182    share.prt('')
    1183    share.prt(share.info['title'] + '\n')
    1184    share.prt('-' * n)
    1185    share.prt(' Quality check:  First line of every paragraph, expected vs. actual.')
    1186    share.prt('      See also:  http://www.lafn.org/~cymbala/Lia/lia_qual.html\n')
    1187    
    1188    share.prt('      EXPECTED: ' + share.info['input_expected'])
    1189    share.prt('        ACTUAL: ' + share.info['input_actual'] + '\n')
    1190    
    1191    share.prt('          DATE:  ' + month_day_year + '\n')
    1192    
    1193    share.prt('    begin_para:  ' + str(share.info['begin_para']))
    1194    share.prt('   finish_para:  ' + str(share.info['finish_para']))
    1195    share.prt('-' * n)
    1196    
    1197    expected = Expected()
    1198    share.text_max = expected()
    1199    actual = Actual()
    1200    actual()
    1201    share.diff()
    1202    share.bye_bye()
    1203    
    1204    
    1205    
    1206    # To-do:
    1207    #=======
    1208    #   - if no overlap between expected and actual, print message.
    1209    #
    1210    #   - IMG outputs first_line = TABLE.
    1211    #
    1212    #   - Lisp incrementer: abort if previous line has higher id.
    1213    #   - Lisp incrementer: change values in '()'.
    1214    
    1215    
    1216    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    1217    #            Works well on 2001.09.19  (Volume 1).
    1218    #        Easier to read on 2002.06.20  (uses Unix diff).
    1219    #
    1220    ###
    1221    # Local variables:
    1222    # py-indent-offset: 4
    1223    # End:
    1224    #