#! /usr/bin/env python
# -----------------------------------------------------------------------------
"""Catalogue the files on a mounted CD and search its text files.

The script lists every regular file below ``information['CD']``, records the
output of the ``file`` command for each one, writes a summary of
extension/type combinations, and finally feeds every file whose ``file``
description ends in ``text`` to the ``do_ma_un`` line handler.

The code is written to run unchanged on Python 2.6+ and Python 3.
"""
import re
import os
import fileinput
import string
from stat import *
import sys
import tempfile


information = {}

# Acceptable error level.  None is the strict ("final") setting under which
# ERRORS() raises on a file-list size mismatch; 'medium' was the value used
# while checking.
information['ERROR_LEVEL'] = None

# Script:
information['SCRIPT'] = 'www_marxist_org.py'

# Source of inputs:
information['CD'] = '/cdrom'


# -----------------------------------------------------------------------------
class Www_marxists_org:
    """Builds cached file listings for the CD and drives the text search.

    All configuration and derived paths live in the ``self.i`` dictionary
    (the ``information`` mapping handed to the constructor).
    """

    def __init__(self, information):
        """Check that the CD is mounted and build the cached listings.

        ``information`` must provide the keys 'CD', 'SCRIPT' and
        'ERROR_LEVEL'; the constructor adds the compiled regular expressions
        under 're' and the paths of the cache files under /tmp.

        Raises RuntimeError when ``information['CD']`` does not appear in the
        output of the ``mount`` command.
        """
        self.i = information

        # Regular expressions:
        self.i['re'] = {}
        self.i['re']['marx'] = re.compile('marx', re.I)
        #
        # Use a negative lookahead assertion to exclude 'Mandela':
        self.i['re']['mandel'] = re.compile('(mandel(?!a)|germain)', re.I)

        # Check for CD:
        mount_lines = os.popen('mount').read()
        if self.i['CD'] not in mount_lines:
            raise RuntimeError(
                '\nCD not mounted: ' + self.i['CD'] + '\n--\n' + mount_lines)
        self.__stderr__('Found CD.' + '\n')

        # List of files on CD:
        self.i['CD_FILELIST'] = os.path.join(
            '/tmp', self.i['SCRIPT'] + '.cd-filelist')
        self.i['CD_FILELIST_RAW'] = self.i['CD_FILELIST'] + '.raw'
        self.i['CD_FILELIST_FILETYPES'] = self.i['CD_FILELIST'] + '.filetypes'
        self.CD_FILELIST()

        # Get filetypes.
        self.i['CD_FILETYPES_ABSTRACT'] = (
            self.i['CD_FILELIST_FILETYPES'] + '.abstract')
        self.CD_FILELIST_FILETYPES()

    def ERRORS(self, errtype):
        """Run the consistency check named by ``errtype``.

        'errtype__CD_FILELIST' compares the sizes of the raw and the filtered
        file list and raises RuntimeError when they differ and
        ``ERROR_LEVEL`` is None.  The two other known types are placeholders
        that check nothing.  Any other value raises ValueError.
        """
        if errtype == 'errtype__CD_FILELIST':
            # Both sizes are read up front, so a missing list file raises
            # OSError regardless of the error level.
            size_raw = os.path.getsize(self.i['CD_FILELIST_RAW'])
            size = os.path.getsize(self.i['CD_FILELIST'])
            if self.i['ERROR_LEVEL'] is None and size != size_raw:
                raise RuntimeError(
                    '\nUnequal!' + '\n' +
                    self.i['CD_FILELIST_RAW'] + '\n' +
                    self.i['CD_FILELIST'])

        elif errtype == 'errtype__CD_FILELIST_FILETYPES':
            pass

        elif errtype == 'errtype__CD_FILETYPES_ABSTRACT':
            #
            # Check extension and filetype...
            # ... all messed-up on 2001.06.30 ...
            # ...
            pass

        else:
            raise ValueError('Unknown type: ' + str(errtype))

    def CD_FILELIST(self):
        """Create the raw and the filtered file lists if they do not exist.

        The raw list is the output of ``find <CD> -type f``; the filtered
        list is the raw list without the lines containing a single quote.
        """
        if not os.path.isfile(self.i['CD_FILELIST_RAW']):
            self.__stderr__('Creating list of files found on CD...' + '\n')

            # September 2000 has:
            # debian:/home/cymbala# egrep "'" www-marxists-org_mandel_search_2000-09.lst
            # /cdrom/subject/psychology/mia/archive/leontev/images/_vti_cnf/leont'ev.gif
            # /cdrom/subject/psychology/mia/archive/leontev/images/leont'ev.gif
            #
            # 12,581 on September 2000.
            # 12,579 on September 2000, without single-quotes in file name.

            p = os.popen('find ' + self.i['CD'] + ' -type f ' +
                         ' > ' + self.i['CD_FILELIST_RAW'])
            p.close()

        if not os.path.isfile(self.i['CD_FILELIST']):
            # Names with a single quote are dropped because
            # CD_FILELIST_FILETYPES() wraps each name in single quotes.
            p = os.popen('egrep -v "\'" ' + self.i['CD_FILELIST_RAW'] +
                         ' > ' + self.i['CD_FILELIST'])
            p.close()

        # Any differences?
        # - self.ERRORS('errtype__CD_FILELIST')

    def CD_FILELIST_FILETYPES(self):
        """Record the ``file`` description of every listed file.

        Writes one line per file (the ``file`` output up to the first comma)
        to the filetypes cache unless that cache already exists, then builds
        the abstract report.  The ERRORS() checks run only when the
        'scripting_in_progress___do_not_do_error_checks' entry is None or
        absent.
        """
        # ~$ awk '{print $NF; }' /tmp/www_marxist_org.py.cd-filelist.filetypes | \
        # > sort | uniq -c
        #       2 Document
        #    2188 data
        #     101 document
        #       2 executable
        #   10286 text
        # ~$
        # ~$ egrep Document /tmp/www_marxist_org.py.cd-filelist.filetypes
        # /cdrom/reference/archive/hegel/txt/cyril-29.htm: Microsoft Word 6.0 Document
        # /cdrom/reference/archive/hegel/txt/julio19.txt: Microsoft Word 6.0 Document
        # ~$
        # ~$ egrep -i wordper /tmp/www_marxist_org.py.cd-filelist.filetypes
        # /cdrom/history/cuba/archive/castro-sd/19940102.1: WordPerfect document
        # /cdrom/history/cuba/archive/castro-sd/19940102.2: WordPerfect document
        # /cdrom/history/cuba/archive/castro-sd/19940129.1: WordPerfect document
        # ~$
        # ~$ egrep -i execut /tmp/www_marxist_org.py.cd-filelist.filetypes
        # /cdrom/archive/noneng/admin/mlwerke/dsexp50.exe: MS Windows PE 32-bit Intel 80386 GUI executable
        # /cdrom/archive/noneng/software/wdia204z.exe: MS Windows PE 32-bit Intel 80386 GUI executable
        # ~$

        if not os.path.isfile(self.i['CD_FILELIST_FILETYPES']):
            # Read files:
            self.__stderr__("Using 'file' to determine file types..." + '\n')
            with open(self.i['CD_FILELIST_FILETYPES'], 'w') as filetypes:
                with open(self.i['CD_FILELIST']) as listing:
                    for line in listing:
                        # Strip only the newline so a final line without one
                        # does not lose its last character.
                        pathfile = line.rstrip('\n')

                        # Quote pathfile because of names such as:
                        # /cdrom/archive/lenin/media/image/1918/1918(2).jpg
                        p = os.popen('file ' + "'" + pathfile + "'")
                        filetype_read = p.read()
                        p.close()

                        # Keep the description up to the first comma; the
                        # endswith() test also copes with empty output.
                        filetype = filetype_read.split(',')[0]
                        if filetype.endswith('\n'):
                            filetype = filetype[:-1]
                        filetypes.write(filetype + '\n')

        # Create abstract report:
        self.CD_FILETYPES_ABSTRACT()

        # Any discrepancies?
        flag = self.i.get('scripting_in_progress___do_not_do_error_checks')
        if flag is None:
            self.ERRORS('errtype__CD_FILELIST')
            self.ERRORS('errtype__CD_FILELIST_FILETYPES')
            self.ERRORS('errtype__CD_FILETYPES_ABSTRACT')

    def CD_FILETYPES_ABSTRACT(self):
        """Write a count of every extension/type pair, unless it exists.

        Each line of the filetypes cache is split on whitespace: the text
        after the last '.' of the first field (upper-cased, trailing ':'
        removed) is the extension, the upper-cased second field is the type.
        The report holds one 'EXTENSION TYPE COUNT' line per pair, sorted by
        key.  Lines with fewer than two fields are skipped.
        """
        if os.path.isfile(self.i['CD_FILETYPES_ABSTRACT']):
            return

        self.__stderr__("Creating list of extensions and types..." + '\n')

        results = {}
        with open(self.i['CD_FILELIST_FILETYPES']) as listing:
            for line in listing:
                array = line.split()
                if len(array) < 2:
                    continue
                # NOTE(review): a file name containing spaces is cut at the
                # first space here, which is presumably why the 2001.06.30
                # report at the end of this file has odd keys -- confirm.
                extension = array[0].split('.')[-1].upper()
                if extension.endswith(':'):
                    extension = extension[:-1]
                filetype = array[1].upper()

                key = extension + ' ' + filetype
                results[key] = results.get(key, 0) + 1

        # Create file with results:
        with open(self.i['CD_FILETYPES_ABSTRACT'], 'w') as abstract:
            for key in sorted(results):
                abstract.write(key + ' ' + str(results[key]) + '\n')

    def __stderr__(self, string):
        """Write ``string`` to standard error without adding a newline."""
        sys.stderr.write(string)

    def __stdout__(self, string):
        """Write ``string`` to standard output without adding a newline."""
        sys.stdout.write(string)

    def __call__(self, dictionary):
        """Run the ``do_ma_un`` line handler over every text file.

        For each path in the filetypes cache whose description ends in the
        word 'text', ``dictionary['inputs']`` is set to a one-element list
        holding the path and the handler is called with ``dictionary``.
        Progress (running file number and path) is written to standard
        error.
        """
        # Module to read a file whether it's MAC, UNIX or DOS.
        sys.path.append(os.path.expanduser('~/bin'))
        import do_ma_un
        # DOs/MAc/UNix line handler:
        food_processor = do_ma_un.Do_ma_un()

        # Read list of files.
        files = {}
        with open(self.i['CD_FILELIST_FILETYPES']) as listing:
            for line in listing:
                #
                # EXAMPLE:
                # /cdrom/admin/webstats/index.html: HTML document text
                # /cdrom/admin/webstats/info.txt: English text
                # /cdrom/admin/webstats/msfree.gif: GIF image data
                #
                array = line.split()
                if not array:
                    continue
                pathfile = array[0][:-1]  # remove ':'
                files[pathfile] = array[1:]

        # Process each file; n counts every file, not just the text ones.
        for n, pathfile in enumerate(sorted(files), 1):
            description = files[pathfile]
            # Skip things like JPEGs, GIFs and Microsoft Word 6.0 Documents:
            if description and description[-1] == 'text':
                sys.stderr.write(str(n) + ' ' + pathfile + '\n')
                dictionary['inputs'] = [pathfile]
                food_processor(dictionary)

    def name_search(self, dictionary):
        """Print the current logical line when it matches the 'marx' pattern.

        Writes ``dictionary['input']``, ``dictionary['logical_line']`` and a
        blank line to standard output.  Presumably called by the do_ma_un
        handler through ``dictionary['function_to_call']`` -- confirm.
        """
        # NOTE(review): the pattern is hard-coded to 'marx', so
        # dictionary['search_for'] is ignored; the original alternative was
        #   if dictionary['search_for'].search(dictionary['logical_line']):
        if self.i['re']['marx'].search(dictionary['logical_line']):
            # A blank line marks end of observation.
            self.__stdout__(dictionary['input'] + '\n')
            self.__stdout__(dictionary['logical_line'] + '\n')
            self.__stdout__('' + '\n')


# -----------------------------------------------------------------------------
information['scripting_in_progress___do_not_do_error_checks'] = 't'

if __name__ == '__main__':
    Program = Www_marxists_org(information)
    dictionary = {}
    dictionary['search_for'] = Program.i['re']['mandel']
    dictionary['function_to_call'] = Program.name_search
    Program(dictionary)


###
# -----------------------------------------------------------------------------

# 2001.06.30:
# /tmp/www_marxist_org.py.cd-filelist.filetypes.abstract:
# /CDROM/ARCHIVE/TROTSKY/WORKS/1919-MIL/DD MICROSOFT 1
# /CDROM/ARCHIVE/TROTSKY/WORKS/PDF/CRAWFORD_TROTSKY_WRITINGS_ON_BR PDF 1
# /CDROM/HISTORY/ETOL/IND HTML 1
# 1 ASCII 4
# 1 DATA 2
# 1 ENGLISH 448
# 1 WORDPERFECT 2
# 10 ENGLISH 1
# 11 ENGLISH 1
# 12 ENGLISH 1
# 13 ENGLISH 1
# 14 ASCII 1
# 14 ENGLISH 1
# 2 ASCII 1
# 2 DATA 1
# 2 ENGLISH 159
# 2 WORDPERFECT 1
# 3 ASCII 1
# 3 ENGLISH 81
# 4 ASCII 2
# 4 C++ 1
# 4 ENGLISH 36
# 5 ASCII 1
# 5 ENGLISH 18
# 6 ASCII 1
# 6 ENGLISH 9
# 7 ENGLISH 4
# 8 ASCII 1
# 8 ENGLISH 1
# 9 ENGLISH 1
# BAK HTML 3
# CSS ASCII 68
# CSS C 65
# CSS DATA 2
# CSS EXPORTED 4
# DOC MICROSOFT 2
# EXE MS 2
# GIF ASCII 2
# GIF DATA 2
# GIF GIF 887
# GIF JPEG 1
# HIST ASCII 1
# HIST~ ASCII 1
# HTACCESS ASCII 1
# HTM ASCII 39
# HTM C 2
# HTM C++ 6
# HTM DATA 107
# HTM ENGLISH 1965
# HTM EXPORTED 58
# HTM HTML 6811
# HTM INTERNATIONAL 5
# HTM MICROSOFT 1
# HTM RICH 2
# HTML ASCII 2
# HTML DATA 2
# HTML EXPORTED 9
# HTML HTML 353
# ICO DATA 5
# JPG ASCII 5
# JPG DATA 1
# JPG GIF 1
# JPG JPEG 1063
# LOG ASCII 3
# MID STANDARD 1
# MIDI STANDARD 1
# MP3 DATA 6
# MP3 MPEG 11
# MPG MPEG 1
# PDF PDF 97
# PL ENGLISH 1
# PL PERL 1
# RTF RICH 1
# SIT DATA 16
# TXT ASCII 6
# TXT DATA 7
# TXT ENGLISH 64
# TXT EXPORTED 1
# TXT HTML 1
# TXT MICROSOFT 1
# TXT NEWS 16
# TXT SMTP 17
# ZIP DATA 4
# ZIP ZIP 61
# ---

# |