#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Time-stamp: <2003-05-15 05:16:10 cymbala>
#
# For more information about this script, see:
# <URL: http://www.lafn.org/~cymbala/Lia/lia_qual.html>

# SYNTAX: python lia_1st_.py [-n y | --finish_para=y] [--begin_para=x]
#                            [-v vol | --volume=vol] [-d n | --debug=n]
# ------------------------------------------------------------------
#
# Quality check: compares the first line of every paragraph listed in an
# "expected" text file against the first lines extracted from the "actual"
# HTML files, then reports the output of the Unix `diff' command.


info = {}
info['title'] = 'LENIN COLLECTED WORKS'
info['py_name'] = 'lia_1st_'
info['debug'] = 0
info['debug_echo_htm'] = 0
# Default volume (override with -v / --volume).  Earlier revisions of this
# file assigned, in turn: 01 02 03 14 22 23 24 25 26 27 28 29 30; only the
# last assignment ever took effect.
info['v'] = '30'
info['begin_para'] = 1
info['finish_para'] = 10000


from sgmllib import SGMLParser
import fileinput
import formatter
import getopt
import os
import popen2
import re
import string
import sys
import tempfile
import time
import types


class Share:
    """State and helpers shared by the Expected, Actual and parser objects.

    Holds the parsed command-line options, the shared regular expressions,
    the temporary-file bookkeeping and the report file that `prt' writes to.
    """

    def __init__compile_regexps__(self):
        """Compile the regular expressions stored in self.re."""
        self.re = {}
        self.re['comment'] = re.compile(r'^[\s]*#')
        self.re['letter'] = re.compile('[a-z]', re.I)
        # NOTE(review): these four literals only match when the input files
        # use the same encoding as this source file -- confirm.
        self.re['special_emdash'] = re.compile("—")
        self.re['special_rsquo'] = re.compile("’")  # Try C-r ' on that one!
        self.re['special_ldquo'] = re.compile("“")
        self.re['special_rdquo'] = re.compile("”")
        self.re['white_spaces_leading'] = re.compile(r'^\s+')
        self.re['newline'] = re.compile('\n')
        # NOTE(review): the previous pattern matched the empty string, so the
        # substitution in Actual.actual_fileinput inserted a space at every
        # position of every line.  Restored to the `&nbsp;' entity reference
        # (optional semicolon) that the key name describes -- confirm.
        self.re['nonbreaking_spaces_ref'] = re.compile('&nbsp;?')
        self.re['comment_volpage'] = re.compile(
            'v[^0-9]*([0-9]+)[^p]*p[^0-9]*([0-9]+)', re.I)
        #
        self.re['numbers'] = re.compile('^([0-9]+)$')
        self.re['ast_two_numbers'] = re.compile(
            '[^0-9]*([0-9]+)[^0-9]+([0-9]+)')

    def __init__(self, info):
        """Parse sys.argv into `info' and open the report file.

        info -- dictionary of defaults; it is updated in place with the
                command-line values and the two input file names.

        Raises getopt.GetoptError on an unknown option and ValueError when
        a numeric option is not an integer.
        """
        self.__init__compile_regexps__()

        self.info = info

        # Each long option is its own list element; a missing comma here
        # once fused 'volume=' and 'begin_para=' into a single bogus option.
        optlist, args = getopt.getopt(sys.argv[1:], 'd:n:v:',
                                      ['finish_para=', 'volume=',
                                       'begin_para=', 'debug='])
        # If -n specified set begin_para to 1 unless --begin_para also
        # specified.
        begin_para_given = None
        for opt, value in optlist:
            if opt == '-d' or opt == '--debug':
                self.info['debug'] = int(value)  # int() very important!
            elif opt == '-v' or opt == '--volume':
                self.info['v'] = value
            elif opt == '-n' or opt == '--finish_para':
                if not begin_para_given:
                    self.info['begin_para'] = 1
                self.info['finish_para'] = value
            elif opt == '--begin_para':
                self.info['begin_para'] = value
                begin_para_given = 't'
        self.info['begin_para'] = int(self.info['begin_para'])
        self.info['finish_para'] = int(self.info['finish_para'])

        self.info['input_actual'] = (
            '~/www.marxists.org/archive/lenin/works/cw/volume'
            + self.info['v'] + '.htm')
        self.info['input_expected'] = (
            '~/www.marxists.org/archive/lenin/works/cw/lia_1st_'
            + self.info['v'] + '.txt')

        self.debug = self.info['debug']
        self.debug_echo_htm = self.info['debug_echo_htm']

        # NOTE(review): tempfile.mktemp() is race-prone; the name is only
        # used as a common prefix for the report and data files.
        self.tempfile = tempfile.mktemp()
        self.report_filename = self.create_rptname(0)
        self.reportobj_w = open(self.report_filename, 'w')
        self.tempfiles = []

        self.begin_para = self.info['begin_para']
        self.finish_para = self.info['finish_para']

        # Replacement text for numeric character references.
        self.charref_four = {}
        self.charref_four['147'] = '"'
        self.charref_four['148'] = '"'
        self.charref_four['8212'] = "--"
        self.charref_four['8217'] = "'"
        self.charref_four['8220'] = '"'
        self.charref_four['8221'] = '"'

        self.first_lines_max_chars = 80

    def create_rptname(self, n):
        """Return the report file name for number `n' (zero-padded)."""
        filename = self.tempfile + '_' + self.info['py_name']
        return filename + ('%02d' % n) + '.rpt'

    def os_path_join_norm(self, base, anchor):
        """Join `base' and `anchor', normalize, then expand `~'."""
        return os.path.expanduser(
            os.path.normpath(os.path.join(base, anchor)))

    def expanduser(self, name):
        """Return `name' with a leading `~' expanded."""
        return os.path.expanduser(name)

    def filename(self, filename):
        """Return `filename' with `~' expanded.

        Raises IOError when the result is not an existing regular file.
        """
        filename = self.expanduser(filename)
        if not os.path.isfile(filename):
            raise IOError('Not an existing regular file: ' + filename)
        if self.debug:
            self.prt('\n*** filename *** ' + filename)
        return filename

    def fileobj_w(self, filename):
        """Open `filename' for writing and remember it for cleanup."""
        self.tempfiles.append(filename)
        return open(filename, 'w')

    def row_id_return(self, v, t, l):
        """Return the row id `VVV.TTT.LLLL' for volume, text and line."""
        return '%03d.%03d.%04d' % (v, t, l)

    def diff(self):
        """Run `diff' on the expected and actual data files; report it."""
        filearg1 = expected.fileobj_name
        filearg2 = actual.fileobj_name
        cmd = 'diff ' + filearg1 + ' ' + filearg2

        child_stdout, child_stdin = popen2.popen2(cmd)
        self.prt('\nRESULTS:')
        self.prt('< ' + filearg1 + '')
        self.prt('> ' + filearg2 + '\n')
        self.prt(child_stdout.read())

        child_stdin.close()
        child_stdout.close()

    def prt(self, message):
        """Print `message' and append it to the report file."""
        print(message)
        self.reportobj_w.write(message + '\n')

    def bye_bye(self):
        """Clean up, list the parsed files and finalize the report.

        Temporary data files are kept when debugging is on.  The report is
        renamed from number 00 to the volume number.
        """
        # Remove temporary files.
        if not self.debug:
            for filename in self.tempfiles:
                os.remove(filename)

        self.prt('\nDone.')

        # To find files that aren't link, print names of files parsed.
        self.prt('')
        # Delete leading "/home/cymbala/www.marxists.org/"
        prefix_re = re.compile(r'.*\.org/', re.I)
        for seen in actual.seen_these_before:
            match_obj = prefix_re.match(seen)
            if match_obj:
                self.prt(seen[match_obj.end(0):])
            else:
                self.prt(seen)
        self.prt('')

        self.reportobj_w.close()

        # self.volume is assigned by Actual.__call__.
        filename = self.create_rptname(self.volume)
        os.rename(self.report_filename, filename)


class Expected:
    """Reader and validator for the "expected" first-lines text file."""

    def __init__compile_regexps__(self):
        """Compile the patterns for each record type of the input file."""
        self.re = {}
        self.re['comment_first_lines'] = re.compile('^([^ ]+[ ]*)?#')

        # 000: 001-045
        self.re['volume_range'] = re.compile(
            '^000: ([0-9]+)-([0-9]+)$')
        # 001.000: 001-004
        self.re['text_range'] = re.compile(
            r'^([0-9]{3})\.000: ([0-9]+)-([0-9]+)$')

        self.triad = r'^([0-9]{3})\.([0-9]{3})\.([0-9]{4})'

        # 001.001.0000: (+ 0 123) 123
        self.re['line_range'] = re.compile(
            self.triad + r':?[ ]+\(\+ ([0-9]+) ([0-9]+)\) ([0-9]+)$')
        # 001.001.0000 () Title
        self.re['title'] = re.compile(
            self.triad + r'[ ][ ]\(\)[ ]*(.*)$')
        # 001.001.0000 not_space_or_hash
        self.re['first_line'] = re.compile(
            self.triad + '[ ]([^ ].*)$')

    def __init__(self):
        self.__init__compile_regexps__()

        self.instance_of = 'expected'

        # Maps row id -> length of the expected first line.
        self.line_lengths = {}

    def __call__(self):
        """Reset the cumulative paragraph counter and read the input.

        Returns the value returned by read_input().
        """
        share.para_cumulative = 0
        return self.read_input()

    def para_counter_reset(self):
        """Reset the per-text paragraph counter."""
        self.line = 0

    def text_counter_reset(self):
        """Reset the text counter."""
        self.text = 0

    def para_counter_increment(self):
        """Count one paragraph, both per-text and cumulatively."""
        if self.increment_para_counter_flag:
            self.line = self.line + 1
            share.para_cumulative = share.para_cumulative + 1

    def text_counter_increment(self):
        """Close the previous text, then count a new one at most once.

        The increment flag is cleared here and set again by the first
        paragraph of the text, so a title record and a line-range record
        for the same text increment the counter only once.
        """
        self.test_end_of_text()
        #
        if self.increment_text_counter_flag:
            self.text = self.text + 1
            if share.debug:
                share.prt('TEXT_COUNTER_INCREMENT: ' + str(self.text))
        #
        self.increment_text_counter_flag = 0

    def test_end_of_text(self):
        """Check the paragraph count of the text that just ended.

        Raises ValueError when the count differs from the declared one.
        """
        # Between two texts, end of previous text is marked by:
        #    003.002.0000 () Uncritical Criticism
        #    003.002.0000: (+ 0 27) 27
        #
        # or:
        #    003.002.0000: (+ 0 27) 27
        #    003.002.0000 () Uncritical Criticism
        #
        # ...or not, if file is sorted...
        #
        # ...so, this test happens during new title AND during
        # new line range, WHICHEVER COMES FIRST.

        if self.test_end_of_text_flag:
            if not self.line == self.line_end:
                found = str(self.line)
                wanted = str(self.line_end)
                prefix = 'Text ' + str(self.text) + ': '
                raise ValueError(prefix + 'Found ' + found
                                 + ' paragraphs, expected ' + wanted)
        self.test_end_of_text_flag = 0

    def _check_ids(self, match_obj, line, check_line):
        """Raise ValueError unless the record ids match the counters."""
        if not self.volume == int(match_obj.group(1)):
            raise ValueError(str(self.volume)
                             + ': Unexpected volume number: ' + line)
        elif not self.text == int(match_obj.group(2)):
            raise ValueError(str(self.text)
                             + ': Unexpected text number: ' + line)
        elif check_line and not self.line == int(match_obj.group(3)):
            raise ValueError(str(self.line)
                             + ': Unexpected line number: ' + line)

    def _handle_volume_range(self, match_obj):
        """Handle a `000: 001-045' record."""
        self.found_volume_range = 1
        self.volume_min = int(match_obj.group(1))
        self.volume_max = int(match_obj.group(2))

        if share.debug:
            share.prt('Volume Start : ' + str(self.volume_min))
            share.prt('Volume End : ' + str(self.volume_max))

    def _handle_text_range(self, match_obj):
        """Handle a `001.000: 001-004' record."""
        self.found_text_range = 1
        self.volume = int(match_obj.group(1))
        self.text_min = int(match_obj.group(2))
        self.text_max = int(match_obj.group(3))

        share.prt('\nVOLUME: ' + str(self.volume))

        if share.debug:
            share.prt('VOLUME ------> ' + str(self.volume))
            share.prt('Text Start : ' + str(self.text_min))
            share.prt('Text End : ' + str(self.text_max))

    def _handle_line_range(self, match_obj, line):
        """Handle a `001.001.0000: (+ 0 123) 123' record."""
        if 0 == int(match_obj.group(3)):

            # Must run before line_end is overwritten: it validates the
            # previous text against the previous line_end.
            self.text_counter_increment()

            self.line_start = int(match_obj.group(5))
            self.line_end = int(match_obj.group(6))

            if not self.line_start == self.line_end:
                raise ValueError('Invalid format: ' + line)
            else:
                self.line_start = 1

            if share.debug:
                share.prt('TEXT ------> ' + str(self.text))
                share.prt('Line Start : ' + str(self.line_start))
                share.prt('Line End : ' + str(self.line_end))

            self.para_counter_reset()

        self._check_ids(match_obj, line, 0)

    def _handle_title(self, match_obj):
        """Handle a `001.001.0000  () Title' (or subtitle) record."""
        if 0 == int(match_obj.group(3)):
            self.text_counter_increment()
            self.para_counter_reset()
            self.title = match_obj.group(4)
            share.prt('\nTitle (' + str(self.text) + '): ' + self.title)
        else:
            self.subtitle = match_obj.group(4)
            if share.debug and self.subtitle:
                share.prt(' subtitle: ' + self.subtitle)

    def _handle_first_line(self, match_obj, line):
        """Handle a `001.001.0001 First words of paragraph' record."""
        self.para_counter_increment()

        if self.line == 1:
            self.increment_text_counter_flag = 1
            self.test_end_of_text_flag = 1

        self._check_ids(match_obj, line, 1)

        if ((share.para_cumulative >= share.begin_para) and
                (share.para_cumulative <= share.finish_para)):
            row_id = share.row_id_return(self.volume,
                                         self.text,
                                         self.line)
            self.line_lengths[row_id] = len(match_obj.group(4))

            self.fileobj_w.write(line)

    def read_input(self):
        """Read and validate the expected data file.

        Paragraph lines inside the begin_para..finish_para window are
        copied to a temporary data file and their lengths are recorded in
        self.line_lengths.

        Returns the highest text number declared by the text-range record.
        Raises ValueError on any malformed or inconsistent input and
        IOError when the input file does not exist.
        """
        filename = share.filename(
            share.info['input_' + string.lower(self.instance_of)])

        self.fileobj_name = (share.tempfile + '_'
                             + self.instance_of + '_data')
        self.fileobj_w = share.fileobj_w(self.fileobj_name)
        self.found_volume_range = 0
        self.found_text_range = 0
        self.volume_min = self.text_min = 2
        self.volume_max = self.text_max = 0

        self.test_end_of_text_flag = 0
        self.increment_text_counter_flag = 1
        self.increment_para_counter_flag = 1
        self.text_counter_reset()

        for line in fileinput.input(filename):

            if self.re['comment_first_lines'].match(line):
                continue

            # The record types are tried in this fixed order; only the
            # first pattern that matches handles the line.
            self.fall_through = 1

            match_obj = self.re['volume_range'].match(line)
            if match_obj:
                self._handle_volume_range(match_obj)
                continue

            match_obj = self.re['text_range'].match(line)
            if match_obj:
                self._handle_text_range(match_obj)
                continue

            match_obj = self.re['line_range'].match(line)
            if match_obj:
                self._handle_line_range(match_obj, line)
                continue

            match_obj = self.re['title'].match(line)
            if match_obj:
                self._handle_title(match_obj)
                continue

            match_obj = self.re['first_line'].match(line)
            if match_obj:
                self._handle_first_line(match_obj, line)
                continue

            self.fall_through = 0
            raise ValueError('Line not trapped: ' + line)

        self.test_end_of_text()

        if not self.found_volume_range:
            raise ValueError('Did not find volume range in: ' + filename)
        elif not self.found_text_range:
            raise ValueError('Did not find text range in: ' + filename)
        elif not self.text == self.text_max:
            found = str(self.text)
            wanted = str(self.text_max)
            raise ValueError('Found ' + found + ' texts, expected ' + wanted)

        self.fileobj_w.close()
        return self.text_max


class Actual:
    """Driver that walks the volume's HTML files and extracts first lines."""

    def __init__compile_regexps__(self):
        """Compile the file-name patterns stored in self.re."""
        self.re = {}
        self.re['dot_htm'] = re.compile('[.]html?$', re.I)
        self.re['volume_htm'] = re.compile('/volume([^.]+)[.]html?$', re.I)
        self.re['index_htm'] = re.compile('/index[.]html?$', re.I)

    def __init__(self):
        self.__init__compile_regexps__()

        self.instance_of = 'actual'

        self.filename_stack = []
        self.seen_these_before = []

        # Tags that do not end the first line being collected.
        share.info['ignore_tags_first_line'] = ['a',
                                                'em',
                                                'span',
                                                'sup']
        self.switch_by_level = None

        # (attribute, value) pairs whose elements produce no first line.
        share.info['ignore_with_attrs_of_'] = {}
        # Not: quote, sig.
        # Not sure: indexa.
        share.info['ignore_with_attrs_of_']['p'] = (
            ('class', 'footer'),
            ('class', 'indent'),
            ('class', 'index'),
            ('class', 'indexa'),
            ('class', 'indexb'),
            ('class', 'indexc'),
            ('class', 'index-list'),
            ('class', 'information'),
            ('class', 'head'),
            ('class', 'next'),
            ('class', 'pagenote'),
            ('class', 'pagenoteb'),
            ('class', 'sub'),
            ('class', 'toc'))
        share.info['ignore_with_attrs_of_']['blockquote'] = (
            ('class', 'abcdefg'),
            ('class', 'lmnopqr'))
        # ('class', 'quote'),

    def custom_io(self, action, filename):
        """Push or pop a file on the stack of files being parsed.

        action 'open'  -- read `filename' fully, push its name and return
                          the list of lines.
        action 'close' -- pop the most recent name and return None.

        Raises ValueError for any other action.
        """
        if action == 'open':
            # Allows opening of a 2nd file before end of 1st file.
            #
            # Change to urllib.urlopen ...

            return_list = []
            for line in fileinput.input(filename):
                return_list.append(line)
            fileinput.close()
            self.filename_stack.append(filename)
            return return_list
        elif action == 'close':
            del self.filename_stack[-1]
            return None
        else:
            raise ValueError('Unknown action: ' + action)

    def actual_fileinput(self, filename):
        """Parse one HTML file, recursing into its anchors via the parser.

        Returns None without parsing when the text counter is already past
        the last expected text; returns 0 otherwise.
        """
        if share.debug:
            share.prt('*** ACTUAL_FILEINPUT *** Actual :: ' + filename)
        if self.text > share.text_max:
            if share.debug:
                share.prt('TEXT_NOT_IN_EXPECTED: ' + filename)
            return

        if self.level == 1:
            self.switch_by_level = 't'
            self.text_counter_increment()
            self.para_counter_reset()

        # SEE: "test" in /usr/lib/python1.5/htmllib.py for example.
        # formatter.NullFormatter()
        # formatter.AbstractFormatter(formatter.DumbWriter())

        if share.debug:
            share.prt('\nCREATING PARSER...')
        # NOTE(review): the formatter lands in ACTUALParser's `verbose'
        # parameter; being truthy, it switches on unbalanced-tag reports.
        parser = ACTUALParser(formatter.NullFormatter())
        parser.chunk_ignore_switch = None
        # -- parser.tag_ignore_switch = None

        parser.filename_of_source_data = filename
        parser.basename_of_source_data = os.path.basename(filename)

        # Required for actual since order does not matter.
        self.row_id_previous = None

        lines_list = self.custom_io('open', filename)

        # For reporting unbalanced tags and out-of-sequence volpage.
        self.source_filename = filename

        for i in range(len(lines_list)):

            # For reporting unbalanced tags.
            self.unbalanced_i = i

            line = lines_list[i]
            if share.debug_echo_htm:
                share.prt(parser.basename_of_source_data + ' >' + line[:-1])

            # Non-breaking space:
            if share.re['nonbreaking_spaces_ref'].search(line):
                line = share.re['nonbreaking_spaces_ref'].sub(' ', line)

            parser.line = line
            parser.feed(line)
        parser.close()
        self.custom_io('close', None)
        return 0

    def __call__(self):
        """Check the volume number, then extract the actual first lines.

        Raises ValueError when the input file name carries no volume
        number or when it differs from the expected volume.
        """
        share.para_cumulative = 0

        filename = share.filename(
            share.info['input_' + string.lower(self.instance_of)])

        self.level = 0

        match_obj = self.re['volume_htm'].search(filename)
        if match_obj:
            self.volume = share.volume = int(match_obj.group(1))
            if not self.volume == expected.volume:
                raise ValueError('Volumes different: expected='
                                 + str(expected.volume)
                                 + ' actual=' + str(self.volume))
        else:
            raise ValueError(filename + ' failed: '
                             + self.re['volume_htm'].pattern)

        self.fileobj_name = (share.tempfile + '_'
                             + self.instance_of + '_data')
        self.fileobj_w = share.fileobj_w(self.fileobj_name)
        self.read_input(filename)
        self.fileobj_w.close()

    def para_counter_reset(self):
        """Reset the per-text paragraph counter."""
        self.line = 0

    def text_counter_reset(self):
        """Reset the text counter."""
        self.text = 0

    def para_counter_increment(self):
        """Count one paragraph, both per-text and cumulatively."""
        self.line = self.line + 1
        share.para_cumulative = share.para_cumulative + 1

    def text_counter_increment(self):
        """Count one text."""
        self.text = self.text + 1
        if share.debug:
            share.prt('TEXT_COUNTER_INCREMENT: ' + str(self.text))

    def read_input(self, filename):
        """Reset the text counter and parse `filename'."""
        self.text_counter_reset()
        self.actual_fileinput(filename)

    def anchors_to_exclude(self, url):
        """Return 't' when `url' must not be followed, else None."""
        paths = string.split(url, '/')
        # [ ... 'archive', 'lenin', 'works', 'cw', 'volume01.htm']
        switch = None

        # When reading volume99.htm file, remember dirname in
        # order to exclude files outside dirname's branch.
        level0_dirname = os.path.dirname(url)
        if share.debug:
            share.prt('\nINPUT: ' + url)
            share.prt('restricting to dir: ' + level0_dirname)
        if not self.level0_dirname == url[:len(self.level0_dirname)]:
            switch = 't'

        if url in self.seen_these_before:
            switch = 't'
        else:
            for i in range(len(paths)):
                # == /archive/
                if paths[i] == 'archive':
                    if re.compile('^index[.]html?$').match(paths[-1]):
                        # == COLLECTED WORKS index
                        if paths[-2] == 'cw':
                            switch = 't'
                    # == Not enough pieces
                    # == ("archive/lenin/works/index.htm")
                    if ((i + 4) > (len(paths) - 1)):
                        switch = 't'
                    #
                    # == 'x' file (see:
                    # == marxists.org/admin/workshop/info/file-structure.txt
                    # elif paths[-1][:1] == 'x': switch = 't'
                    #
                    # == /archive/(not lenin)/
                    elif paths[i + 1] != 'lenin':
                        switch = 't'
                # == /glossary/
                elif paths[i] == 'admin':
                    switch = 't'
                elif paths[i] == 'glossary':
                    switch = 't'
                elif paths[i] == 'subject':
                    switch = 't'
        return switch

    def seen_it_before(self, candidate):
        """Record `candidate'; return it if already seen, else None."""
        if not candidate in self.seen_these_before:
            self.seen_these_before.append(candidate)
            return None
        else:
            return candidate


class ACTUALSGMLParser(SGMLParser):
    """Same as SGMLParser but report _where_ unbalanced tag exists"""

    # Example -- report an unbalanced </...> tag.
    def report_unbalanced(self, tag):
        """Report the file, line index and tag stack of a stray end tag."""
        if self.verbose:
            msg = '--- Line=' + str(actual.unbalanced_i)
            msg = msg + ' File=' + actual.source_filename
            share.prt(msg)
            share.prt('*-* Unbalanced </' + tag + '>')
            share.prt('*-* Stack: ' + repr(self.stack))


class ACTUALParser(ACTUALSGMLParser):
    """Parser that collects the first line of each paragraph-like element.

    The caller assigns chunk_ignore_switch, filename_of_source_data and
    basename_of_source_data after construction (see
    Actual.actual_fileinput).
    """
    #
    # Using SGMLParser instead of HTMLParser by incorporating
    # just a few pieces from HTMLParser here.

    #
    # Copied from TestSGMLParser in /usr/lib/python1.5/sgmllib.py.
    # ------------------------------------------------------------
    def __init__(self, verbose=0):
        self.actualdata = ''
        self.anchorlist = []
        self.volpage = {'v': 0, 'p': 0}
        self.base = None
        # Defined up front so anchor_end never meets a missing attribute.
        self.anchor = None
        self.onblur = ''
        SGMLParser.__init__(self, verbose)
        self.first_line = ''
        self.first_line_switch = None

    def handle_data(self, data):
        """Accumulate character data; flush once it grows long enough."""
        #
        # -- if not self.tag_ignore_switch:
        if len(self.stack) > 0:
            # This "not 'span'" seems wrong ... 2002.04.03 ...
            if not 'span' in self.stack:
                data = share.re['white_spaces_leading'].sub(' ', data)
                data = share.re['special_emdash'].sub("--", data)
                data = share.re['special_rsquo'].sub("'", data)
                data = share.re['special_ldquo'].sub('"', data)
                data = share.re['special_rdquo'].sub('"', data)
                self.actualdata = self.actualdata + data
        if len(repr(self.actualdata)) >= 70:
            self.flush()

    def flush(self):
        """Move pending data into the first line (up to the maximum)."""
        data = self.actualdata
        if data:
            #
            if self.first_line_switch:
                if len(self.first_line) < share.first_lines_max_chars:
                    self.first_line = self.first_line + data
            #
            # Defaults:
            self.actualdata = ''

    def volpage_check(self, mygroup):
        """Check a volume/page marker against the previous one.

        mygroup -- the `group' method of a comment_volpage match object.
        """
        if self.volpage['v'] == 0:
            self.volpage['v'] = int(mygroup(1))
            self.volpage['p'] = int(mygroup(2))
        else:
            if self.volpage['v'] != int(mygroup(1)):
                share.prt('\nINVALID VOLUME in '
                          + actual.source_filename + ': '
                          + mygroup(0))
            if (self.volpage['p'] + 1) != int(mygroup(2)):
                share.prt('\nINVALID PAGE in '
                          + actual.source_filename + ': '
                          + mygroup(0))
            else:
                self.volpage['p'] = int(mygroup(2))

    def handle_comment(self, data):
        """Flush pending data and check short comments for volume/page."""
        self.flush()
        if len(data) < 100:
            # EXAMPLE:
            # <!-- vol 07 page 34 -->
            match_obj = share.re['comment_volpage'].search(data)
            if match_obj:
                self.volpage_check(match_obj.group)

    def unknown_starttag(self, tag, attrs):
        self.flush()

    def unknown_endtag(self, tag):
        self.flush()

    def unknown_entityref(self, ref):
        self.flush()

    def unknown_charref(self, ref):
        """Substitute a known character reference, or `~' when unknown."""
        if ref in share.charref_four.keys():
            char = share.charref_four[ref]
        else:
            char = '~'
        self.handle_data(char)

    def close(self):
        SGMLParser.close(self)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, method, attrs):
        """End the current first line unless `tag' is an ignored one."""
        if not tag in share.info['ignore_tags_first_line']:
            self.handle_first_line()
            self.first_line_switch = None
        #
        # Default definition:
        method(attrs)

    # Overridable -- handle end tag
    def handle_endtag(self, tag, method):
        """Like handle_starttag; also detect the start of the footnotes."""
        #
        # Ignore everything starting with footnotes.
        if ((self.actualdata == 'Footnotes' or
             self.actualdata == "Author's Footnotes" or
             self.actualdata == 'Endnotes' or
             self.actualdata == "Editor's Endnotes")
            and
            (tag == 'h2' or tag == 'h3' or tag == 'h4')):
            self.chunk_ignore_switch = 't'
        if not tag in share.info['ignore_tags_first_line']:
            self.handle_first_line()
            self.first_line_switch = None
        #
        # Default definition:
        method()

    # Do (ones that don't require a separate end tag).
    def do_hr(self, attrs):
        pass

    def do_img(self, attrs):
        """Emit an `IMAGE ' first line, except for heading GIFs."""
        if actual.switch_by_level:
            self.first_line_switch = 't'
            self.first_line = 'IMAGE '

            # Exclude heading (H2, H3,...) GIFs in "The State and Revolution"
            for attr in attrs:
                if string.upper(attr[0]) == 'SRC':
                    if string.lower(attr[1]) in ['pics/chpt1.gif',
                                                 'pics/chpt2.gif',
                                                 'pics/chpt3.gif',
                                                 'pics/chpt4.gif',
                                                 'pics/chpt5.gif',
                                                 'pics/chpt6.gif',
                                                 'pics/index.gif',
                                                 'pics/intro.gif',
                                                 'pics/manuscpt.gif',
                                                 'pics/name.gif',
                                                 'pics/postscpt.gif',
                                                 'pics/preface.gif']:
                        self.first_line_switch = None
                        self.first_line = ''
                elif string.upper(attr[0]) == 'ONMOUSEOUT':
                    if string.upper(attr[1]) == 'T':
                        self.first_line_switch = None
                        self.first_line = ''
            self.cull_first_line()

    def do_base(self, attrs):
        """Remember the href of a <base> tag (from htmllib.py)."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    # Empty definitions to get tags into stack.
    def start_body(self, attrs):
        pass

    def start_center(self, attrs):
        pass

    def start_em(self, attrs):
        pass

    def start_head(self, attrs):
        pass

    def start_html(self, attrs):
        pass

    def start_h1(self, attrs):
        pass

    def end_h1(self):
        pass

    def start_h2(self, attrs):
        pass

    #
    def end_blockquote(self):
        pass

    def end_body(self):
        pass

    def end_h2(self):
        pass

    def start_h3(self, attrs):
        pass

    def end_h3(self):
        pass

    def start_h4(self, attrs):
        pass

    def end_h4(self):
        pass

    def start_h5(self, attrs):
        pass

    def end_h5(self):
        pass

    def start_h6(self, attrs):
        pass

    def end_h6(self):
        pass

    def end_p(self):
        pass

    # Blockquote's and p's will output first-lines.
    def start_output_wrapper(self, tag, attrs):
        """Start collecting a first line for a top-level `tag' element.

        Collection starts only when the tag stack is html/body/tag or
        html/body/blockquote/tag and no attribute is on the ignore list.
        """
        switch = None
        if self.stack == ['html', 'body', tag]:
            switch = 't'
        elif self.stack == ['html', 'body', 'blockquote', tag]:
            switch = 't'
        #
        for attr in attrs:
            if attr in share.info['ignore_with_attrs_of_'][tag]:
                switch = None
        if switch and actual.switch_by_level:
            self.first_line_switch = 't'

    def start_blockquote(self, attrs):
        self.start_output_wrapper('blockquote', attrs)

    def start_p(self, attrs):
        self.start_output_wrapper('p', attrs)

    def start_span(self, attrs):
        pass

    def start_style(self, attrs):
        pass

    def start_table(self, attrs):
        """Emit a `TABLE ' first line, except inside index files."""
        if not actual.re['index_htm'].search(self.filename_of_source_data):
            if actual.switch_by_level:
                self.first_line_switch = 't'
                self.first_line = 'TABLE '
                self.cull_first_line()
        self.first_line_switch = None

    def start_title(self, attrs):
        pass

    def start_a(self, attrs):
        """Collect the anchor attributes (from htmllib.py)."""
        href = ''
        name = ''
        anchor_type = ''
        onblur = ''
        for attrname, value in attrs:
            value = string.strip(value)
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchor_type = string.lower(value)
            elif attrname == 'onblur':
                onblur = 't'
        self.anchor_bgn(href, name, anchor_type, onblur)

    #
    def end_a(self):
        self.anchor_end()

    #
    def anchor_bgn(self, href, name, type, onblur):
        """Remember the anchor that just opened."""
        self.anchor = href
        self.onblur = onblur
        if self.anchor:
            self.anchorlist.append(href)

    #
    def anchor_end(self):
        """Resolve the anchor's target and hand it to handle_data_anchor.

        Anchors without an href, or with an onblur attribute, are skipped.
        """
        if (self.anchor and (not self.onblur)):
            dict = {'tagname': 'a',
                    'anchor': self.anchor,
                    'data': self.actualdata
                    }

            if self.base:
                dict['base'] = self.base
            else:
                dict['base'] = os.path.dirname(actual.filename_stack[-1])

            filename = share.os_path_join_norm(dict['base'], dict['anchor'])
            dict['absolute'] = filename

            self.handle_data_anchor(dict)  # RjC -2001.09.02
            #
            self.anchor = None

    def handle_data_anchor(self, dict):
        """Follow an anchor to another HTML file, unless excluded.

        dict -- anchor description built by anchor_end; the key `absolute'
                holds the resolved file name.
        """
        filename = dict['absolute']

        if filename in actual.seen_these_before:
            if share.debug:
                share.prt('ALREADY SEEN: ' + filename)
            return

        #
        if 'data' in dict.keys():
            # Remove newlines, redundant spaces, leading spaces, etc.
            if dict['data']:
                dict['data'] = string.join(string.split(dict['data']))
        #
        if actual.re['dot_htm'].search(filename):
            #
            # Exclude certain anchors, similar to wget's --no-parent but
            # not quite because subdirectory `works/cw' is at the same level
            # as children of the hierarchy such as `works/1906'
            #

            if actual.level == 0:
                # When reading volume99.htm file, remember dirname in
                # order to exclude files outside dirname's branch.
                actual.level0_dirname = os.path.dirname(dict['absolute'])
                if share.debug:
                    share.prt('\nINPUT: ' + dict['absolute'])
                    share.prt('restricting to dir: '
                              + actual.level0_dirname)

                # 2002.10.07: actual.text_counter_increment()
                actual.para_counter_reset()
                actual.switch_by_level = 't'

            if actual.anchors_to_exclude(filename):
                if share.debug:
                    share.prt('ANCHORS_TO_EXCLUDE: ' + filename)
                return
            if actual.seen_it_before(filename):
                if share.debug:
                    share.prt('SEEN_IT_BEFORE: ' + filename)
                return

            if share.debug:
                share.prt('\nFOUND ANCHOR: ' + filename)
                share.prt('\nDICT: ' + str(dict))
                share.prt('-' * 77)
            # The nested parse may change switch_by_level; restore it.
            previous_switch_by_level = actual.switch_by_level
            actual.level = actual.level + 1
            actual.actual_fileinput(dict['absolute'])
            actual.level = actual.level - 1
            actual.switch_by_level = previous_switch_by_level

    def cull(self, line):
        """Count a non-empty paragraph line and write it to the data file.

        Inside the begin_para..finish_para window the line is truncated to
        the expected length for its row id, or flagged as missing.
        """
        if line:
            actual.para_counter_increment()

            if ((share.para_cumulative >= share.begin_para) and
                    (share.para_cumulative <= share.finish_para)):
                row_id = share.row_id_return(actual.volume,
                                             actual.text,
                                             actual.line)
                if row_id in expected.line_lengths.keys():
                    line_out = string.join(
                        [row_id,
                         line[:expected.line_lengths[row_id]]])
                else:
                    line_out = string.join(
                        [row_id,
                         '-MissingFromExpected-',
                         line[:share.first_lines_max_chars]])

                if share.debug:
                    share.prt('fileobj_w.write> ' + line_out)
                actual.fileobj_w.write(line_out + '\n')

    def cull_first_line(self):
        """Normalize the whitespace of the first line and cull it."""
        if (self.first_line_switch and not self.chunk_ignore_switch):
            line = self.first_line
            self.first_line = ''

            line = share.re['white_spaces_leading'].sub('', line)
            line = string.join(string.split(line))

            self.cull(line)

    def handle_first_line(self):
        """Flush pending data and cull the first line when allowed."""
        self.flush()
        switch = 't'
        #
        # Skip: ``<p> </p>''

        if self.chunk_ignore_switch:
            switch = None
        elif not actual.switch_by_level:
            switch = None
        #
        if switch and self.first_line:
            self.cull_first_line()


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
share = Share(info)

month_day_year = time.strftime("%B %d, %Y", time.localtime(time.time()))

n = 70
share.prt('')
share.prt(share.info['title'] + '\n')
share.prt('-' * n)
share.prt(' Quality check: First line of every paragraph, expected vs. actual.')
share.prt(' See also: http://www.lafn.org/~cymbala/Lia/lia_qual.html\n')

share.prt(' EXPECTED: ' + share.info['input_expected'])
share.prt(' ACTUAL: ' + share.info['input_actual'] + '\n')

share.prt(' DATE: ' + month_day_year + '\n')

share.prt(' begin_para: ' + str(share.info['begin_para']))
share.prt(' finish_para: ' + str(share.info['finish_para']))
share.prt('-' * n)

# The classes above reach these three objects through their global names.
expected = Expected()
share.text_max = expected()
actual = Actual()
actual()
share.diff()
share.bye_bye()


# To-do:
#=======
# - if no overlap between expected and actual, print message.
#
# - IMG outputs first_line = TABLE.
#
# - Lisp incrementer: abort if previous line has higher id.
# - Lisp incrementer: change values in '()'.


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Works well on 2001.09.19 (Volume 1).
# Easier to read on 2002.06.20 (uses Unix diff).
#
###
# Local variables:
# py-indent-offset: 4
# End:
#