1    #! /usr/bin/env python
       2    # -----------------------------------------------------------------------------
       3    import re
       4    import os
       5    import fileinput
       6    import string
       7    from stat import *
       8    import sys
       9    import tempfile
      10    
      11    
      12    
      13    information = {}
      14    
      15    # Acceptable error level.
      16    information['ERROR_LEVEL'] = 'medium'   #  CHECKING.
      17    information['ERROR_LEVEL'] = None       #  FINAL.
      18    
      19    # Script:
      20    information['SCRIPT'] = 'www_marxist_org.py'
      21    
      22    # Source of inputs:
      23    information['CD'] = '/cdrom'
      24    
      25    
      26    # -----------------------------------------------------------------------------
      27    class Www_marxists_org:
      28    
      29        def __init__(self, information):
      30            """DESCRIPTION"""
      31    
      32            self.i = information
      33    
      34            # Regular expressions:
      35            self.i['re'] = {}
      36            self.i['re']['marx'] = re.compile('marx', re.I)
      37            #
      38            # Use a negative lookahead assertion to exlude 'Mandela':
      39            self.i['re']['mandel'] = re.compile('mandel', re.I)
      40            self.i['re']['mandel'] = re.compile('(mandel(?!a)|germain)', re.I)
      41    
      42            # Check for CD:
      43            mount_lines = os.popen('mount').read()
      44            if string.find(mount_lines, self.i['CD']) > -1:
      45                self.__stderr__('Found CD.' + '\n')
      46                pass
      47            else:
      48                raise '\nCD not mounted: ' + self.i['CD'] + '\n--\n' + mount_lines
      49    
      50    
      51            # List of files on CD:
      52            #
      53            self.i['CD_FILELIST'] = os.path.join('/tmp', self.i['SCRIPT'] + '.cd-filelist')
      54            self.i['CD_FILELIST_RAW'] = self.i['CD_FILELIST'] + '.raw'
      55            self.i['CD_FILELIST_FILETYPES'] = self.i['CD_FILELIST'] + '.filetypes'
      56            self.CD_FILELIST()
      57            
      58            # Get filetypes.
      59            #
      60            self.i['CD_FILETYPES_ABSTRACT'] = self.i['CD_FILELIST_FILETYPES'] + '.abstract'
      61            self.CD_FILELIST_FILETYPES()
      62            
      63            pass
      64    
      65    
      66        def ERRORS(self, errtype):
      67            """DESCRIPTION"""
      68    
      69            if errtype == 'errtype__CD_FILELIST':
      70                #
      71                size_raw = os.stat(self.i['CD_FILELIST_RAW'])[ST_SIZE]
      72                size = os.stat(self.i['CD_FILELIST'])[ST_SIZE]
      73                if self.i['ERROR_LEVEL'] == None:
      74                    if size != size_raw:
      75                        raise '\nUnequal!' + '\n' + \
                              self.i['CD_FILELIST_RAW'] + '\n' + \
                              self.i['CD_FILELIST']
      78                    pass
      79                pass
      80    
      81            elif errtype == 'errtype__CD_FILELIST_FILETYPES':
      82                pass
      83    
      84            elif errtype == 'errtype__CD_FILETYPES_ABSTRACT':
      85                #
      86                # Check extension and filetype...
      87                #       ... all messed-up on 2001.06.30 ...
      88                # ...
      89                # ...
      90                # ...
      91                pass
      92    
      93            else:
      94                raise 'Unknown type: ' + str(errtype)
      95            
      96            pass
      97    
      98    
      99        def CD_FILELIST(self):
     100            """DESCRIPTION"""
     101            
     102            if not os.path.isfile(self.i['CD_FILELIST_RAW']):
     103                self.__stderr__('Creating list of files found on CD...' + '\n')
     104    
     105                # September 2000 has:
     106                # debian:/home/cymbala# egrep "'" www-marxists-org_mandel_search_2000-09.lst
     107                # /cdrom/subject/psychology/mia/archive/leontev/images/_vti_cnf/leont'ev.gif
     108                # /cdrom/subject/psychology/mia/archive/leontev/images/leont'ev.gif
     109                #
     110                # 12,581 on September 2000.
     111                # 12,579 on September 2000, without single-quotes in file name.
     112    
     113                p = os.popen('find ' + self.i['CD'] + ' -type f ' + \
                         # ' | egrep -v "\'" ' + \
     115                         ' > ' + self.i['CD_FILELIST_RAW'])
     116                p.close()
     117                pass
     118    
     119            if not os.path.isfile(self.i['CD_FILELIST']):
     120                p = os.popen('egrep -v "\'" ' + self.i['CD_FILELIST_RAW'] + \
                         ' > ' + self.i['CD_FILELIST'])
     122                p.close()
     123                pass
     124    
     125            # Any differences?
     126            # - self.ERRORS('errtype__CD_FILELIST')
     127            
     128            pass
     129    
     130    
     131        def CD_FILELIST_FILETYPES(self):
     132            """This attribute... """
     133    
     134            # ~$ awk '{print $NF; }' /tmp/www_marxist_org.py.cd-filelist.filetypes | \
     135            # > sort | uniq -c
     136            #       2	Document
     137            #    2188	data
     138            #     101	document
     139            #       2	executable
     140            #   10286	text
     141            # ~$
     142            # ~$ egrep Document /tmp/www_marxist_org.py.cd-filelist.filetypes
     143            # /cdrom/reference/archive/hegel/txt/cyril-29.htm: Microsoft Word 6.0 Document
     144            # /cdrom/reference/archive/hegel/txt/julio19.txt: Microsoft Word 6.0 Document
     145            # ~$
     146            # ~$ egrep -i wordper /tmp/www_marxist_org.py.cd-filelist.filetypes
     147            # /cdrom/history/cuba/archive/castro-sd/19940102.1: WordPerfect document
     148            # /cdrom/history/cuba/archive/castro-sd/19940102.2: WordPerfect document
     149            # /cdrom/history/cuba/archive/castro-sd/19940129.1: WordPerfect document
     150            # ~$
     151            # ~$ egrep -i execut /tmp/www_marxist_org.py.cd-filelist.filetypes
     152            # /cdrom/archive/noneng/admin/mlwerke/dsexp50.exe: MS Windows PE 32-bit Intel 80386 GUI executable
     153            # /cdrom/archive/noneng/software/wdia204z.exe: MS Windows PE 32-bit Intel 80386 GUI executable
     154            # ~$
     155            # ~$
     156            # ~$
     157    
     158            
     159            if not os.path.isfile(self.i['CD_FILELIST_FILETYPES']):
     160                # Read files:
     161                self.__stderr__("Using 'file' to determine file types..." + '\n')
     162                #
     163                filetypes = open(self.i['CD_FILELIST_FILETYPES'], 'w')
     164                #
     165                for line in fileinput.input(self.i['CD_FILELIST']):
     166                    pathfile = line[:-1]
     167    
     168                    # Quote pathfile because of names such as:
     169                    # /cdrom/archive/lenin/media/image/1918/1918(2).jpg
     170                    #
     171                    p = os.popen('file ' + "'" + pathfile + "'")
     172                    
     173                    filetype_read = p.read()
     174                    p.close()
     175    
     176                
     177                    filetype = string.split(filetype_read, ',')[0]
     178                    if filetype[-1] == '\n':
     179                        filetype = filetype[:-1]
     180                        pass
     181                    filetypes.write(filetype + '\n')
     182                    #
     183                    pass
     184                filetypes.close()
     185                #
     186                pass
     187    
     188            # Create abstract report:
     189            self.CD_FILETYPES_ABSTRACT()
     190    
     191            # Any discrepancies?
     192            #
     193            if self.i['scripting_in_progress___do_not_do_error_checks'] == None:
     194                self.ERRORS('errtype__CD_FILELIST')
     195                self.ERRORS('errtype__CD_FILELIST_FILETYPES')
     196                self.ERRORS('errtype__CD_FILETYPES_ABSTRACT')
     197                pass
     198            
     199            pass
     200    
     201    
     202        def CD_FILETYPES_ABSTRACT(self):
     203            """This attribute... """
     204    
     205            if not os.path.isfile(self.i['CD_FILETYPES_ABSTRACT']):
     206                self.__stderr__("Creating list of extensions and types..." + '\n')
     207                #
     208                results = {}
     209                for line in fileinput.input(self.i['CD_FILELIST_FILETYPES']):
     210                    array = string.split(line)
     211                    #
     212                    extension = string.upper(string.split(array[0], '.')[-1])
     213                    if extension[-1] == ':':
     214                        extension = extension[:-1]
     215                        pass
     216                    #
     217                    filetype = string.upper(array[1])
     218    
     219                    key = extension + ' ' + filetype
     220                    if not key in results.keys():
     221                        results[key] = 0
     222                        pass
     223                    
     224                    results[key] = results[key] + 1
     225                    pass
     226                
     227                # Create file with results:
     228                abstract = open(self.i['CD_FILETYPES_ABSTRACT'], 'w')
     229                keys = results.keys()
     230                keys.sort()
     231                for key in keys:
     232                    abstract.write(key + ' ' + str(results[key]) + '\n')
     233                    pass
     234                
     235                pass
     236            pass
     237    
     238    
     239        def __stderr__(self, string):
     240            """DESCRIPTION"""
     241            sys.stderr.write(string)
     242            pass
     243        def __stdout__(self, string):
     244            """DESCRIPTION"""
     245            sys.stdout.write(string)
     246            pass
     247    
     248    
     249        def __call__(self, dictionary):
     250            """DESCRIPTION"""
     251    
     252            # Module to read a file whether it's MAC, UNIX or DOS.
     253            sys.path.append(os.path.expanduser('~/bin'))
     254            import do_ma_un
     255            # DOs/MAc/UNix line handler:
     256            food_processor = do_ma_un.Do_ma_un()
     257    
     258            # Read list of files.
     259            files = {}
     260            for line in fileinput.input(self.i['CD_FILELIST_FILETYPES']):
     261                #
     262                # EXAMPLE:
     263                # /cdrom/admin/webstats/index.html: HTML document text
     264                # /cdrom/admin/webstats/info.txt: English text
     265                # /cdrom/admin/webstats/msfree.gif: GIF image data
     266                #
     267                #
     268                array = string.split(line)
     269                pathfile = array[0][:-1]   # remove ':'
     270                files[pathfile] = array[1:]
     271                pass
     272    
     273            # Process each file.
     274            keys = files.keys()
     275            keys.sort()
     276            n = 0
     277            for pathfile in keys:
     278                n = n + 1
     279    
     280                # Skip things like JPEGs, GIFs and Microsoft Word 6.0 Documents:
     281                if files[pathfile][-1] == 'text':
     282                    sys.stderr.write(str(n) + '  ' + pathfile + '\n')
     283                    #
     284                    dictionary['inputs'] = [pathfile]
     285                    food_processor(dictionary)
     286                    pass
     287                pass
     288            pass
     289    
     290    
     291        def name_search(self, dictionary):
     292    
     293            # if dictionary['search_for'].search(dictionary['logical_line']):
     294            if self.i['re']['marx'].search(dictionary['logical_line']):
     295    
     296                # A blank line marks end of observation.
     297                self.__stdout__(dictionary['input'] + '\n')
     298                self.__stdout__(dictionary['logical_line'] + '\n')
     299                self.__stdout__('' + '\n')
     300                pass
     301            pass
     302    
     303    
     304    # -----------------------------------------------------------------------------
     305        pass
     306    # -----------------------------------------------------------------------------
     307    information['scripting_in_progress___do_not_do_error_checks'] = 't'
     308    
     309    Program = Www_marxists_org(information)
     310    dictionary = {}
     311    dictionary['search_for'] = Program.i['re']['marx']
     312    dictionary['search_for'] = Program.i['re']['mandel']
     313    dictionary['function_to_call'] = Program.name_search
     314    Program(dictionary)
     315    
     316    
     317    ###
     318    # -----------------------------------------------------------------------------
     319    
     320    # 2001.06.30:
     321    # /tmp/www_marxist_org.py.cd-filelist.filetypes.abstract:
     322    #   /CDROM/ARCHIVE/TROTSKY/WORKS/1919-MIL/DD MICROSOFT 1
     323    #   /CDROM/ARCHIVE/TROTSKY/WORKS/PDF/CRAWFORD_TROTSKY_WRITINGS_ON_BR PDF 1
     324    #   /CDROM/HISTORY/ETOL/IND HTML 1
     325    #   1 ASCII 4
     326    #   1 DATA 2
     327    #   1 ENGLISH 448
     328    #   1 WORDPERFECT 2
     329    #   10 ENGLISH 1
     330    #   11 ENGLISH 1
     331    #   12 ENGLISH 1
     332    #   13 ENGLISH 1
     333    #   14 ASCII 1
     334    #   14 ENGLISH 1
     335    #   2 ASCII 1
     336    #   2 DATA 1
     337    #   2 ENGLISH 159
     338    #   2 WORDPERFECT 1
     339    #   3 ASCII 1
     340    #   3 ENGLISH 81
     341    #   4 ASCII 2
     342    #   4 C++ 1
     343    #   4 ENGLISH 36
     344    #   5 ASCII 1
     345    #   5 ENGLISH 18
     346    #   6 ASCII 1
     347    #   6 ENGLISH 9
     348    #   7 ENGLISH 4
     349    #   8 ASCII 1
     350    #   8 ENGLISH 1
     351    #   9 ENGLISH 1
     352    #   BAK HTML 3
     353    #   CSS ASCII 68
     354    #   CSS C 65
     355    #   CSS DATA 2
     356    #   CSS EXPORTED 4
     357    #   DOC MICROSOFT 2
     358    #   EXE MS 2
     359    #   GIF ASCII 2
     360    #   GIF DATA 2
     361    #   GIF GIF 887
     362    #   GIF JPEG 1
     363    #   HIST ASCII 1
     364    #   HIST~ ASCII 1
     365    #   HTACCESS ASCII 1
     366    #   HTM ASCII 39
     367    #   HTM C 2
     368    #   HTM C++ 6
     369    #   HTM DATA 107
     370    #   HTM ENGLISH 1965
     371    #   HTM EXPORTED 58
     372    #   HTM HTML 6811
     373    #   HTM INTERNATIONAL 5
     374    #   HTM MICROSOFT 1
     375    #   HTM RICH 2
     376    #   HTML ASCII 2
     377    #   HTML DATA 2
     378    #   HTML EXPORTED 9
     379    #   HTML HTML 353
     380    #   ICO DATA 5
     381    #   JPG ASCII 5
     382    #   JPG DATA 1
     383    #   JPG GIF 1
     384    #   JPG JPEG 1063
     385    #   LOG ASCII 3
     386    #   MID STANDARD 1
     387    #   MIDI STANDARD 1
     388    #   MP3 DATA 6
     389    #   MP3 MPEG 11
     390    #   MPG MPEG 1
     391    #   PDF PDF 97
     392    #   PL ENGLISH 1
     393    #   PL PERL 1
     394    #   RTF RICH 1
     395    #   SIT DATA 16
     396    #   TXT ASCII 6
     397    #   TXT DATA 7
     398    #   TXT ENGLISH 64
     399    #   TXT EXPORTED 1
     400    #   TXT HTML 1
     401    #   TXT MICROSOFT 1
     402    #   TXT NEWS 16
     403    #   TXT SMTP 17
     404    #   ZIP DATA 4
     405    #   ZIP ZIP 61
     406    #   ---
     407    
     408    #