1    #! /usr/bin/env python
       2    # Time-stamp: <2005-04-12 20:28:08 cymbala>
       3    
       4    # Get a list of '&zzzz;' to '&#x9999;' translations from DocBook.
       5    #   What entities are defined based upon DOCTYPE?
       6    #
       7    # Testing:  (1.) turn self.debug on (2.) execute num_ents_test.py.
       8    #
       9    # ------------------
      10    # Alternative method: Just parse .ced files referenced by PSGML's ECAT.
      11    #
      12    # ------------------
      13    # Why no "&cacute;" ???
      14    #   egrep -i cacute
      15    #      /usr/lib/sgml/dtd/xhtml-1.0/xhtml*.ent
      16    #      /usr/lib/sgml/dtd/xhtml-1.1/xhtml*.ent
      17    # Has these:
      18    #  Aacute
      19    #  Eacute
      20    #  Iacute
      21    #  Oacute
      22    #  Uacute
      23    #  Yacute
      24    #  aacute
      25    #  acute
      26    #  eacute
      27    #  iacute
      28    #  oacute
      29    #  uacute
      30    #  yacute
      31    
      32    
      33    import os
      34    import re
      35    import sys
      36    import string
      37    
      38    class NumCharRef:
      39        """Numeric character references (sect. 4.6 of XML spec.)."""
      40    
      41        def __init__(self):
      42    
      43            # debug
      44            self.debug = 't'
      45            self.debug = None
      46    
      47            # Translations, such as "&quot;" to "&#34;".
      48            self.translation = {}
      49            # Section 4.6 Predefined Entities
      50            # (http://www.w3.org/TR/1998/REC-xml-19980210)
      51            self.translation['&lt;'] = '&#60;'
      52            self.translation['&gt;'] = '&#62;'
      53            self.translation['&amp;'] = '&#38;'
      54            self.translation['&apos;'] = '&#39;'
      55            self.translation['&quot;'] = '&#34;'
      56    
      57            # Where comments (<!-- -->) start and stop; no need
      58            #  to change entities within comments.
      59            self.comments_bounds = []
      60    
      61            # Full path and name of DTD
      62            self.dtd_list = []
      63    
      64            # Original entity definitions.
      65            self.entities_list = []
      66            
      67        def get_doctype(self, xml_pathfile):
      68            """Return <!DOCTYPE"""
      69            file = open(xml_pathfile, 'r')
      70            #
      71            # Get the <!DOCTYPE...
      72            doctype = ''
      73            while 1:
      74                line = file.readline()
      75                # Do not do this: line = re.sub('\n$', '', line)
      76                line = re.sub('.*(<!DOCTYPE)', r'\1', line)
      77    
      78                if re.match('^<!DOCTYPE', line): doctype = line
      79                elif doctype: doctype = doctype + ' ' + line
      80    
      81                if doctype:
      82                    doctype = re.sub('^(<!DOCTYPE[^>]+>).*', '\\1', doctype)
      83    
      84                if doctype and re.search('>$', doctype):
      85                    doctype = re.sub(' *\n *', ' ', doctype)
      86                    break
      87                if not line:
      88                    break
      89            file.close()
      90    
      91            # Test doctype string...
      92            doctype = re.sub('[^>]+$', '', doctype)
      93            if not re.match('^<!DOCTYPE[^>]+>$', doctype):
      94                # Files that are included in other documents won't have DOCTYPE.
      95                # raise 'Invalid DOCTYPE: ' + doctype
      96                pass
      97            else:
      98                self.doctype = doctype
      99                pass
     100    
     101            if self.debug: print 'get_doctype :: ' + doctype
     102            #<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
     103            #  "dtd/xhtml1-transitional.dtd">
     104            #
     105            return doctype
     106    
     107        
     108        def pathfile_from_ents(self, entity, dtd):
     109            """Return pathfile of entity file."""
     110    
     111            # self.LOG = open('/home/cymbala/Db/Homepage/num_ents_pathfile_from_ents.log', 'w')
     112            # self.LOG.write('  ... entity = ' + entity + '\n')
     113            # self.LOG.write('  ... dtd = ' + dtd + '\n')
     114            # self.LOG.close()
     115            
     116            #ent = os.path.dirname(dtd)
     117            #ent = os.path.join(ent, re.search('[^"]+[.]ent', entity).group(0))
     118    
     119            ent = '/usr/share/sgml/html/dtd/xml/1.0/xhtml-special.ent'
     120            return ent
     121    
     122        
     123        def pathfile_from_dt(self, definition, sgml_catalog_files):
     124            """Return SGML_CATALOG_FILE & pathfile of document type definition."""
     125    
     126            # Call example:
     127            # dtd = self.pathfile_from_dt(doctype, sgml_catalog_file)
     128            
     129            # Example:
     130            # <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
     131            #  "dtd/xhtml1-transitional.dtd">
     132    
     133            # Find SGML catalog
     134            # /usr/lib/sgml/catalog
     135    
     136            # This stopped working when /usr/lib/sgml/catalog was changed
     137            # to this (circa mid 2003):
     138            # -- AUTOMATICALLY GENERATED, DO NOT EDIT --
     139            # CATALOG "/usr/lib/sgml/stylesheet/dsssl/sgmltools/sgmltools.cat"
     140            # CATALOG /usr/lib/sgml/transitional.cat
     141            # CATALOG /etc/sgml/jade.cat
     142            # CATALOG /etc/sgml/sgml-data.cat
     143    
     144            # Now we have this:
     145            # ## /etc/sgml/sgml-data.cat : SGML centralized catalog
     146            # ## Please use update-catalog(8) to modify this file.
     147            # CATALOG /usr/share/sgml/html/dtd/xml/1.0/xhtml.soc
     148    
     149            # Now we have this:
     150            # /usr/share/sgml/html/dtd/xml/1.0/xhtml.soc
     151            # PUBLIC  "-//W3C//DTD XHTML 1.0 Strict//EN"  "xhtml1-strict.dtd"
     152            # DTDDECL "-//W3C//DTD XHTML 1.0 Strict//EN"  "xhtml1.dcl"
     153    
     154            # How to parse /usr/lib/sgml/catalog ???
     155            # Don't know.  Let's just hard-code the one file I need.
     156            # SGML_CATALOG_FILES = os.environ.get('SGML_CATALOG_FILES')
     157            # SGML_CATALOG_FILES = '/usr/share/sgml/html/dtd/xml/1.0/xhtml.soc'
     158            # SGML_CATALOG_FILES = sgml_catalog_file
     159            # 2005.04.12: pass list of potential files to this def
     160    
     161            self.LOG = open('/home/cymbala/Db/Homepage/num_ents_pathfile_from_dt.log', 'w')
     162    
     163            matchobj = None
     164            for f in sgml_catalog_files:
     165                self.LOG.write('  ...for f in sgml_catalog_files... ' + f + '\n')
     166                if (not os.path.isfile(f)):
     167                    self.LOG.write('     ' + 'FILE NOT FOUND: ' + f + '\n')
     168                    pass
     169        
     170                if matchobj == None:
     171                    file = open(f, 'r')
     172                    catalog = file.read()
     173                    file.close()
     174    
     175                    verbal = re.sub('^[^"]+', '', definition)
     176                    verbal = re.sub('("[^"]+").*', r'\1', verbal)
     177    
     178                    # Need PUBLIC (as opposed to DTDDECL)
     179                    verbal_re = "PUBLIC +" + verbal
     180                    # start = string.find(catalog, verbal)
     181                    matchobj = re.search(verbal_re, catalog)
     182    
     183                    if not matchobj == None:
     184                        SGML_CATALOG_FILE = f
     185                        self.LOG.write(' ! SGML_CATALOG_FILE = ' + f + '\n')
     186                        pass
     187                    pass
     188                pass
     189            
     190            if matchobj == None:
     191                raise ': Did not find: ' + verbal_re
     192    
     193            dtd_ending = catalog[matchobj.start():]
     194            start = string.find(dtd_ending, verbal)
     195            dtd_ending = dtd_ending[(start + len(verbal)):]
     196            dtd_ending = re.sub('^[\t ]+', '', dtd_ending)
     197            dtd_ending = re.sub('"', '', dtd_ending)
     198    
     199            # No spaces in file names ("locate ' '").
     200            chop_rest_of_file = re.compile('[\n\t ].*', re.DOTALL)
     201            dtd_ending = re.sub(chop_rest_of_file, '', dtd_ending)
     202    
     203            rootpath = re.sub('.[^/]+$', '', SGML_CATALOG_FILE)
     204            dtd = os.path.join(rootpath, dtd_ending)
     205    
     206            if not os.path.isfile(dtd):
     207                raise 'File not found: ' + dtd
     208    
     209            self.LOG.write(' ! dtd = ' + dtd + '\n')
     210            self.dtd_list.append(dtd)
     211    
     212            if self.debug: print 'pathfile_from_dt :: ' + dtd
     213            # /usr/lib/sgml/dtd/xhtml-1.0/xhtml1-transitional.dtd
     214            #
     215            self.LOG.close()
     216            return [SGML_CATALOG_FILE, dtd]
     217    
     218        def get_entities(self, definition, type):
     219            """Extract entity declarations."""
     220            re_entities = re.compile('<!ENTITY[^>]+>', re.DOTALL)
     221            entities = re_entities.findall(definition)
     222    
     223            if self.debug: print 'get_entities :: ' + str(entities)
     224    
     225            if type == 'dot_ent_files':
     226                re_entity_phrases = re.compile(
     227                    '(ENTITIES|[.]ent)', re.I)
     228            elif type == 'character_entities':
     229                # iso-lat1:
     230                #  ... r'^<!ENTITY[\t ]+[^\t ]+[\t ]+"&#[0-9]+;"[\t ]*>$', re.I)
     231                # iso-lat1 and iso-lat2:
     232                re_entity_phrases = re.compile(
     233                    r'^<!ENTITY[\t ]+[^\t ]+[\t ]+"&#x?[0-9A-F]+;"[\t ]*>$', re.I)
     234            else:
     235                raise 'Type not recognized: ' + type
     236        
     237            del_list = []
     238            for i in range(len(entities)):
     239                entities[i] = re.sub('\n', ' ', entities[i])
     240                if not re_entity_phrases.search(entities[i]):
     241                    del_list.append(i)
     242    
     243            # Discard entity declarations that are not about entities.
     244            del_list.reverse()
     245            for i in del_list:
     246                del entities[i]
     247    
     248            self.entities_list.append(entities)
     249            return entities
     250    
     251        def get_translation(self, entities, dtd):
     252            """Dictionary: General entities to numeric character references."""
     253    
     254            for i in range(len(entities)):
     255                entity = re.sub('\n', ' ', entities[i])
     256                ent = self.pathfile_from_ents(entity, dtd)
     257    
     258                # Example:
     259                # <!ENTITY % HTMLspecial PUBLIC
     260                # "-//W3C//ENTITIES Special for XHTML//EN"
     261                # "http://www.w3.org/TR/xhtml1/DTD/xhtml-special.ent">
     262                #
     263                # /usr/share/sgml/html/dtd/xml/1.0/xhtml-special.ent
     264    
     265                # Example:
     266                # <!ENTITY cent   "&#162;"> <!-- cent sign, U+00A2 ISOnum -->
     267    
     268                file = open(ent, 'r')
     269                definition = file.read()
     270                file.close()
     271                if self.debug: print 'get_translation :: ' + ent
     272    
     273                ent_entities = self.get_entities(definition,
     274                                                     'character_entities')
     275    
     276                if self.debug: print 'get_translation :: ' + str(ent_entities)
     277                # ['<!ENTITY abreve\011"&#x0103;">',
     278                #  '<!ENTITY Abreve\011"&#x0102;">', ... ...
     279    
     280                
     281                for i in range(len(ent_entities)):
     282                    # ['<!ENTITY nbsp   "&#160;">',
     283                    if self.debug: print 'get_translation :: ' + ent_entities[i]
     284                    #
     285                    array = string.split(ent_entities[i])
     286                    key = '&' + array[1] + ';'
     287                    value = re.sub('[">]', '', array[2])
     288                    if key in self.translation.keys():
     289                        if value != self.translation[key]:
     290                            #
     291                            # Redundant: &OElig;: &#338; != &#x0152;
     292                            # raise 'Redundant: ' + key + ': ' \
     293                            #       + value + ' != ' + self.translation[key]
     294                            if self.translation[key][:3] == '&#x':
     295                                self.translation[key] = value
     296                        else:
     297                            # Redundant: &quot;: &#34; != &#34;
     298                            # raise 'Redundant: ' + key + ': ' \
     299                            #       + value + ' != ' + self.translation[key]
     300                            pass
     301                    else:
     302                        self.translation[key] = value
     303    
     304                    # {'&rarr;': '&#8594;', '&beta;': '&#946;', ...
     305    
     306                if self.debug:
     307                    print 'get_translation :: COUNT: ' + str(len(self.translation))
     308                    print '\n'
     309    
     310        def comments_locations(self, string):
     311            re_comment = re.compile('<!--.*?-->')
     312            start_pos = 0
     313            bounds = []
     314            while 1:
     315                matchobj = re_comment.search(string, start_pos)
     316                if matchobj == None:
     317                    break
     318                start_pos = matchobj.end()
     319                bounds.append((matchobj.start(), matchobj.end()))
     320            return bounds
     321    
     322        def entity_replace(self, matchobj):
     323            # If inside a comment, return untranslated.
     324            for i in range(len(self.comments_bounds)):
     325                if matchobj.start(0) > self.comments_bounds[i][0] and \
                   matchobj.end(0) < self.comments_bounds[i][1]:
     327                    return matchobj.group(0)
     328            if matchobj.group(0) in self.translation.keys():
     329                return self.translation[matchobj.group(0)]
     330            else:
     331                # Give up, no translation available.
     332                # print self.translation
     333                # print ' start: ' + str(matchobj.start(0))
     334                # print '   end: ' + str(matchobj.end(0))
     335                # print str(self.comments_bounds)
     336                #
     337                # raise 'Unknown entity: ' + matchobj.group(0)
     338                #
     339                # Too many exceptions!  2001.05.02
     340                return matchobj.group(0)
     341    
     342            # Why aren't /usr/lib/sgml/entities/iso-lat2.ent entities defined?
     343            #       dtd/xhtml-1.0/xhtml1-transitional.dtd
     344            
     345            #       1	&z12;  &frac12;
     346            #       1	&z14;  &frac14;
     347            #       1	&z1;
     348            #       1	&z2;
     349            #       1	&z34;
     350            #       1	&z3;
     351            #       1	&z4;
     352            #     244	&z;
     353                        
     354        def replace_entities(self, data):
     355            """Change general entities to numeric entities."""
     356            
     357            data = re.sub('&[a-zA-Z]+[1-4]?;', self.entity_replace, data)
     358            return data
     359    
     360        def __call__(self, xml_pathfile, sgml_catalog_files):
     361            """Given file name, return contents with entities replaced."""
     362    
     363            file = open(xml_pathfile, 'r')
     364            file_contents = file.read()
     365            file.close()
     366    
     367            doctype = self.get_doctype(xml_pathfile)
     368            if doctype == '':
     369                return file_contents
     370    
     371            # Get name of file from verbal description of document type definition.
     372            rc = self.pathfile_from_dt(doctype, sgml_catalog_files)
     373            sgml_catalog_file = rc[0]
     374            dtd = rc[1] 
     375    
     376            if self.debug:
     377                print '__call__ :: sgml_catalog_file : ' + sgml_catalog_file
     378                print '__call__ :: dtd : ' + dtd
     379                pass
     380    
     381            # Read document type definition, from, e.g.,
     382            # /usr/share/sgml/html/dtd/xml/1.0/xhtml1-transitional.dtd
     383            # 2005.04.12: /usr/share/sgml/docbook/dtd/xml/3.1.7/docbookx.dtd
     384            file = open(dtd, 'r')
     385            definition = file.read()
     386            file.close()
     387    
     388            # WAS: /usr/lib/sgml/dtd/xhtml-1.0/xhtml1-transitional.dtd
     389            # IS:  /usr/share/sgml/html/dtd/xml/1.0/xhtml1-transitional.dtd
     390            # <!ENTITY % HTMLsymbol PUBLIC
     391            #    "-//W3C//ENTITIES Symbols for XHTML//EN"
     392            #    "http://www.w3.org/TR/xhtml1/DTD/xhtml-symbol.ent">
     393            # %HTMLsymbol;
     394    
     395            entities = self.get_entities(definition, 'dot_ent_files')
     396    
     397            if self.debug: print '__call__ :: ' + str(entities)
     398            # ['<!ENTITY % HTMLlat2 PUBLIC
     399            #    "ISO 8879:1986//ENTITIES Added Latin 2//EN//XML"
     400            #    "http://www.w3.org/TR/xhtml1/DTD/xhtml-lat1.ent">',
     401            #  '<!ENTITY % HTMLlat1 PUBLIC
     402            #    "-//W3C//ENTITIES Latin 1 for XHTML//EN"  ...  ...  ...
     403            
     404            self.get_translation(entities, dtd)
     405    
     406            self.comments_bounds = self.comments_locations(file_contents)
     407            file_contents = self.replace_entities(file_contents)
     408    
     409            # Final results:
     410            return file_contents
     411    
     412        # ------------------------------------------------------------------------
     413    
     414    if __name__ == '__rain_in_Spain_falls_mainly_on_plain__':
     415    
     416        my_class = NumCharRef()
     417        new_file = my_class('/home/cymbala/Db/Homepage/index.xml')
     418        print new_file
     419    
     420        # Intermediate pieces:
     421        # print my_class.doctype
     422        # print ''
     423        # print str(my_class.dtd_list)
     424        # print ''
     425        # print str(my_class.entities_list)
     426        # print ''
     427        # print str(my_class.translation)
     428        # print ''
     429        # print str(my_class.comments_bounds)
     430        # print ''
     431    
     432    ###
     433    #
     434    # Local variables:
     435    # py-indent-offset: 4
     436    # End: