#! /usr/bin/env python
# Time-stamp: <2005-04-12 20:28:08 cymbala>

# Get a list of '&zzzz;' to '&#39321;' translations from DocBook.
# What entities are defined based upon DOCTYPE?
#
# Testing: (1.) turn self.debug on (2.) execute num_ents_test.py.
#
# ------------------
# Alternative method: Just parse .ced files referenced by PSGML's ECAT.
#
# ------------------
# Why no "&cacute;" ???
#   egrep -i cacute
#     /usr/lib/sgml/dtd/xhtml-1.0/xhtml*.ent
#     /usr/lib/sgml/dtd/xhtml-1.1/xhtml*.ent
#   Has these:
#     Aacute
#     Eacute
#     Iacute
#     Oacute
#     Uacute
#     Yacute
#     aacute
#     acute
#     eacute
#     iacute
#     oacute
#     uacute
#     yacute

"""Replace general entity references with numeric character references.

The document's DOCTYPE is looked up in a list of SGML catalog files to
find its DTD; character entity declarations are then read from an entity
file and used to rewrite references such as '&nbsp;' as '&#160;'.
"""

import logging
import os
import re
import sys
import string

# Progress messages of pathfile_from_dt() go here (DEBUG level) instead
# of to a log file at a fixed path in one user's home directory.
_log = logging.getLogger(__name__)


class NumCharRef:
    """Numeric character references (sect. 4.6 of XML spec.).

    Calling an instance with the path of an XML file and a list of SGML
    catalog files returns the file contents with known general entity
    references replaced by numeric character references.

    Attributes:
        debug: when true, intermediate results are printed to stdout.
        translation: dict mapping '&name;' to its numeric reference.
        comments_bounds: (start, end) offsets of the comments of the
            text currently being translated.
        doctype: last well-formed DOCTYPE found by get_doctype().
        dtd_list: DTD files located by pathfile_from_dt().
        entities_list: every list returned by get_entities().
    """

    # The one entity file that is read, whatever the DTD declares (see
    # pathfile_from_ents).  Override on a subclass or an instance to use
    # a different file.
    ENTITY_PATHFILE = '/usr/share/sgml/html/dtd/xml/1.0/xhtml-special.ent'

    # Any entity declaration.
    _RE_ENTITY_DECL = re.compile('<!ENTITY[^>]+>', re.DOTALL)

    # Filters applied by get_entities(), keyed by its `type` argument.
    _RE_ENTITY_FILTERS = {
        # Declarations that pull in entity files.
        'dot_ent_files': re.compile('(ENTITIES|[.]ent)', re.I),
        # Decimal (iso-lat1) and hexadecimal (iso-lat2) definitions.
        'character_entities': re.compile(
            r'^<!ENTITY[\t ]+[^\t ]+[\t ]+"&#x?[0-9A-F]+;"[\t ]*>$', re.I),
    }

    # DOTALL: comments may span several lines.
    _RE_COMMENT = re.compile('<!--.*?-->', re.DOTALL)

    # A general entity reference.  Names may hold several digits
    # ('&frac12;'); numeric references ('&#160;') never match.
    _RE_ENTITY_REF = re.compile(r'&[A-Za-z_][A-Za-z0-9._-]*;')

    def __init__(self, debug=None):
        """Set up the predefined translations.

        Args:
            debug: true value to print intermediate results.
        """
        self.debug = debug

        # Translations, such as "&quot;" to "&#34;".
        self.translation = {}
        # Section 4.6 Predefined Entities
        # (http://www.w3.org/TR/1998/REC-xml-19980210)
        self.translation['&lt;'] = '&#60;'
        self.translation['&gt;'] = '&#62;'
        self.translation['&amp;'] = '&#38;'
        self.translation['&apos;'] = '&#39;'
        self.translation['&quot;'] = '&#34;'

        # Where comments (<!-- -->) start and stop; no need
        # to change entities within comments.
        self.comments_bounds = []

        # Last well-formed DOCTYPE seen by get_doctype().
        self.doctype = ''

        # Full path and name of DTD
        self.dtd_list = []

        # Original entity definitions.
        self.entities_list = []

    def _debug(self, message):
        """Print message when debugging is switched on."""
        if self.debug:
            print(message)

    def get_doctype(self, xml_pathfile):
        """Return the <!DOCTYPE ...> declaration of a file on one line.

        Args:
            xml_pathfile: path of the XML file to scan.

        Returns:
            The declaration with line breaks collapsed to single spaces,
            or '' when the file has none (files that are included in
            other documents won't have a DOCTYPE).
        """
        doctype = ''
        with open(xml_pathfile, 'r') as handle:
            for line in handle:
                # Drop whatever precedes the declaration on its line.
                # Do not strip the newline here; it is needed below.
                line = re.sub('.*(<!DOCTYPE)', r'\1', line)

                if line.startswith('<!DOCTYPE'):
                    doctype = line
                elif doctype:
                    doctype = doctype + ' ' + line
                if not doctype:
                    continue

                # Cut at the first '>' once the declaration is complete.
                doctype = re.sub('^(<!DOCTYPE[^>]+>).*', '\\1', doctype)
                if re.search('>$', doctype):
                    doctype = re.sub(' *\n *', ' ', doctype)
                    break

        # Discard an unterminated tail (everything after the last '>').
        doctype = re.sub('[^>]+$', '', doctype)
        if re.match('^<!DOCTYPE[^>]+>$', doctype):
            self.doctype = doctype

        self._debug('get_doctype :: ' + doctype)
        # <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
        #     "dtd/xhtml1-transitional.dtd">
        return doctype

    def pathfile_from_ents(self, entity, dtd):
        """Return pathfile of entity file.

        Args:
            entity: the <!ENTITY ...> declaration (currently unused).
            dtd: path of the DTD holding the declaration (currently
                unused).

        Returns:
            self.ENTITY_PATHFILE: the system identifier of the
            declaration is not resolved, one fixed file is used for
            every declaration.
        """
        return self.ENTITY_PATHFILE

    def pathfile_from_dt(self, definition, sgml_catalog_files):
        """Return SGML_CATALOG_FILE & pathfile of document type definition.

        Args:
            definition: a DOCTYPE declaration, e.g.
                <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
                    "dtd/xhtml1-transitional.dtd">
            sgml_catalog_files: paths of candidate catalog files, e.g.
                ['/usr/share/sgml/html/dtd/xml/1.0/xhtml.soc'].  Files
                that do not exist are skipped.

        Returns:
            [catalog_file, dtd]: the first catalog with a PUBLIC entry
            for the document type, and the DTD path named by that entry
            (relative to the catalog's directory).

        Raises:
            LookupError: the definition has no quoted public identifier,
                or no catalog lists it.
            IOError: the DTD named by the catalog does not exist.
        """
        # History: /usr/lib/sgml/catalog used to be parsed, which
        # stopped working (circa mid 2003) when it became a list of
        # CATALOG directives.  The entries now live in files such as
        # /usr/share/sgml/html/dtd/xml/1.0/xhtml.soc:
        #   PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "xhtml1-strict.dtd"
        #   DTDDECL "-//W3C//DTD XHTML 1.0 Strict//EN" "xhtml1.dcl"
        # 2005.04.12: pass list of potential files to this def

        # The public identifier is the first quoted string.
        verbal = re.sub('^[^"]+', '', definition)
        verbal = re.sub('("[^"]+").*', r'\1', verbal)
        if not verbal:
            raise LookupError('Did not find a public identifier in: '
                              + definition)

        # Need PUBLIC (as opposed to DTDDECL).  The identifier is
        # literal text, so escape characters such as '.' and '+'.
        public_re = re.compile('PUBLIC +' + re.escape(verbal))

        sgml_catalog_file = None
        matchobj = None
        catalog = ''
        for candidate in sgml_catalog_files:
            _log.debug('...for f in sgml_catalog_files... %s', candidate)
            if not os.path.isfile(candidate):
                _log.debug('FILE NOT FOUND: %s', candidate)
                continue

            with open(candidate, 'r') as handle:
                catalog = handle.read()

            matchobj = public_re.search(catalog)
            if matchobj is not None:
                sgml_catalog_file = candidate
                _log.debug('! SGML_CATALOG_FILE = %s', candidate)
                break

        if sgml_catalog_file is None:
            raise LookupError('Did not find: ' + public_re.pattern)

        # The system identifier follows the public identifier.
        dtd_ending = catalog[matchobj.end():].lstrip('\t ')
        dtd_ending = dtd_ending.replace('"', '')
        # No spaces in file names ("locate ' '"): chop rest of file.
        dtd_ending = re.split('[\n\t ]', dtd_ending, 1)[0]

        dtd = os.path.join(os.path.dirname(sgml_catalog_file), dtd_ending)
        if not os.path.isfile(dtd):
            raise IOError('File not found: ' + dtd)

        _log.debug('! dtd = %s', dtd)
        self.dtd_list.append(dtd)

        self._debug('pathfile_from_dt :: ' + dtd)
        # /usr/lib/sgml/dtd/xhtml-1.0/xhtml1-transitional.dtd
        return [sgml_catalog_file, dtd]

    def get_entities(self, definition, type):
        """Extract entity declarations.

        Args:
            definition: text of a DTD or entity file.
            type: 'dot_ent_files' to keep declarations that refer to
                entity files, 'character_entities' to keep declarations
                of numeric character references.

        Returns:
            The matching declarations, newlines replaced by spaces.  The
            list is also appended to self.entities_list.

        Raises:
            ValueError: type is not one of the two names above.
        """
        if type not in self._RE_ENTITY_FILTERS:
            raise ValueError('Type not recognized: ' + str(type))
        wanted = self._RE_ENTITY_FILTERS[type]

        declarations = self._RE_ENTITY_DECL.findall(definition)
        self._debug('get_entities :: ' + str(declarations))

        # Discard entity declarations that are not about entities.
        entities = []
        for declaration in declarations:
            declaration = declaration.replace('\n', ' ')
            if wanted.search(declaration):
                entities.append(declaration)

        self.entities_list.append(entities)
        return entities

    def get_translation(self, entities, dtd):
        """Dictionary: General entities to numeric character references.

        Reads, for each declaration in entities, the entity file given
        by pathfile_from_ents() and adds its character entities to
        self.translation.

        Args:
            entities: declarations as returned by
                get_entities(..., 'dot_ent_files'), e.g.
                <!ENTITY % HTMLspecial PUBLIC
                    "-//W3C//ENTITIES Special for XHTML//EN"
                    "http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
            dtd: path of the DTD the declarations came from.
        """
        for entity in entities:
            entity = entity.replace('\n', ' ')
            ent = self.pathfile_from_ents(entity, dtd)

            # Example line of an entity file:
            # <!ENTITY cent "&#162;"> <!-- cent sign, U+00A2 ISOnum -->
            with open(ent, 'r') as handle:
                definition = handle.read()
            self._debug('get_translation :: ' + ent)

            ent_entities = self.get_entities(definition,
                                             'character_entities')
            self._debug('get_translation :: ' + str(ent_entities))
            # ['<!ENTITY abreve\011"&#x0103;">',
            #  '<!ENTITY Abreve\011"&#x0102;">', ...

            for declaration in ent_entities:
                # '<!ENTITY nbsp "&#160;">'
                self._debug('get_translation :: ' + declaration)
                fields = declaration.split()
                key = '&' + fields[1] + ';'
                value = re.sub('[">]', '', fields[2])

                if key not in self.translation:
                    self.translation[key] = value
                elif (value != self.translation[key]
                      and self.translation[key][:3] == '&#x'):
                    # Redundant: &OElig;: &#338; != &#x0152;
                    # A hexadecimal definition gives way to the new one.
                    self.translation[key] = value
                # else: Redundant: &quot;: &#34; != &#x0022;
                # The existing definition wins; not an error.

        # {'&rarr;': '&#8594;', '&beta;': '&#946;', ...
        self._debug('get_translation :: COUNT: '
                    + str(len(self.translation)))
        self._debug('\n')

    def comments_locations(self, string):
        """Return the (start, end) offsets of every comment in string.

        Comments may span several lines.
        """
        return [(found.start(), found.end())
                for found in self._RE_COMMENT.finditer(string)]

    def entity_replace(self, matchobj):
        """Return the replacement for one matched entity reference.

        Args:
            matchobj: match of an entity reference in the text whose
                comments are described by self.comments_bounds.

        Returns:
            The numeric character reference, or the reference unchanged
            when it lies inside a comment or has no known translation.
        """
        reference = matchobj.group(0)

        # If inside a comment, return untranslated.
        for bounds in self.comments_bounds:
            if matchobj.start(0) > bounds[0] and matchobj.end(0) < bounds[1]:
                return reference

        # Give up quietly when no translation is available; raising
        # here caused too many exceptions (2001.05.02).
        return self.translation.get(reference, reference)

    # Why aren't /usr/lib/sgml/entities/iso-lat2.ent entities defined?
    # dtd/xhtml-1.0/xhtml1-transitional.dtd
    #
    # Shapes of entity names (letters shown as 'z') and their counts:
    #      1 &z12;    &frac12;
    #      1 &z14;    &frac14;
    #      1 &z1;
    #      1 &z2;
    #      1 &z34;
    #      1 &z3;
    #      1 &z4;
    #    244 &z;

    def replace_entities(self, data):
        """Change general entities to numeric entities.

        Args:
            data: text to translate; self.comments_bounds must describe
                the comments of this same text.

        Returns:
            data with every known entity reference outside of comments
            replaced.
        """
        return self._RE_ENTITY_REF.sub(self.entity_replace, data)

    def __call__(self, xml_pathfile, sgml_catalog_files):
        """Given file name, return contents with entities replaced.

        Args:
            xml_pathfile: path of the XML file to translate.
            sgml_catalog_files: paths of candidate SGML catalog files.

        Returns:
            The file contents, unchanged when the file has no DOCTYPE.

        Raises:
            LookupError, IOError: see pathfile_from_dt().
        """
        with open(xml_pathfile, 'r') as handle:
            file_contents = handle.read()

        doctype = self.get_doctype(xml_pathfile)
        if doctype == '':
            return file_contents

        # Get name of file from verbal description of document type
        # definition.
        sgml_catalog_file, dtd = self.pathfile_from_dt(doctype,
                                                       sgml_catalog_files)
        self._debug('__call__ :: sgml_catalog_file : ' + sgml_catalog_file)
        self._debug('__call__ :: dtd : ' + dtd)

        # Read document type definition, from, e.g.,
        # /usr/share/sgml/html/dtd/xml/1.0/xhtml1-transitional.dtd
        # 2005.04.12: /usr/share/sgml/docbook/dtd/xml/3.1.7/docbookx.dtd
        with open(dtd, 'r') as handle:
            definition = handle.read()

        # <!ENTITY % HTMLsymbol PUBLIC
        #     "-//W3C//ENTITIES Symbols for XHTML//EN"
        #     "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent">
        # %HTMLsymbol;
        entities = self.get_entities(definition, 'dot_ent_files')
        self._debug('__call__ :: ' + str(entities))

        self.get_translation(entities, dtd)

        self.comments_bounds = self.comments_locations(file_contents)
        return self.replace_entities(file_contents)

# ------------------------------------------------------------------------

# The guard never matches on purpose: this is a usage example that
# depends on files of the original author's machine.
if __name__ == '__rain_in_Spain_falls_mainly_on_plain__':

    my_class = NumCharRef()
    new_file = my_class('/home/cymbala/Db/Homepage/index.xml',
                        ['/usr/share/sgml/html/dtd/xml/1.0/xhtml.soc'])
    print(new_file)

    # Intermediate pieces:
    # print(my_class.doctype)
    # print(str(my_class.dtd_list))
    # print(str(my_class.entities_list))
    # print(str(my_class.translation))
    # print(str(my_class.comments_bounds))

###
#
# Local variables:
# py-indent-offset: 4
# End: