1    #!/bin/env python
       2    
       3    #     mia_glos.py
       4    #     2001.06.10
       5    
       6    # SYNTAX: python mia_glos.py > mia_glos.dat
       7    
       8    
       9    # Generate tab-delimited list of glossary entries
      10    # (a.k.a. encyclopedia) from Marxist Internet Archives web pages...
      11    #   ---> under the assumption that web pages were not generated from a
      12    #        pre-existing list of glossary entries!!! <---
      13    #
      14    # If encyclopedia's web pages were generated from a database of
      15    # glossary terms, this script is superfluous in every way with the
      16    # exception of the inside-out-inside principle *.
      17    #
      18    #  * the "inside-out-inside" principle is what I call (without knowing
      19    #    the real term for it) a process where Y = Y' and:
      20    #         - Y is the input to process A which creates output Z, and
      21    #         - Z is the input to process B which creates output Y'
      22    #
      23    #    If Y and Y' are equal, then both processes (A and B) are "correct"
      24    #    as long as they were designed independent of one another (for
      25    #    example, written by two different people using two different
      26    #    programming languages).  The "inside-out-inside" principle is the
      27    #    only way to catch the very last error that may exist (regardless
      28    #    of what the specifications specify).
      29    
      30    
      31    # INPUT / OUTPUT
      32    #
      33    # Input:
      34    #  HTML documents from the Archives.
      35    #
      36    # Example output:
      37    # terms	volition	Volition	marxists.org/glossary/terms/v/o.htm
      38    # terms	voluntarism	Voluntarism	marxists.org/glossary/terms/v/o.htm
      39    # terms	wage-labor	Wage Labour	marxists.org/glossary/terms/w/a.htm
      40    # terms	wage-labour	Wage Labour	marxists.org/glossary/terms/w/a.htm
      41    
      42    
      43    # If "IOError: [Errno socket error] (113, 'No route to host')"
      44    # then:
      45    #  $~ http_proxy=http://proxy.lafn.org:80
      46    #  $~ export http_proxy
      47    
      48    
      49    # TO-DO
      50    #
# - First version used urllib to read input from HTML pages on the Web.
#   Make the input source selectable (the Web, a CD, or a local hard drive).
      53    
      54    # #############################################################################
      55    import os
      56    import re
      57    import sys
      58    import string
      59    import urllib
      60    
      61    information = {}
      62    information['re'] = {}
      63    
      64    information['debug'] = 't'     # See also "debugging".
      65    information['debug'] = None    # See also "debugging".
      66    
      67    
      68    # Glossary sub-sections (entry points into the glossary tree).
      69    #
      70    information['glossary_urls'] = [
      71        'http://www.marxists.org/glossary/events/index.htm',
      72        'http://www.marxists.org/glossary/orgs/index.htm',
      73        'http://www.marxists.org/glossary/people/index.htm',
      74        'http://www.marxists.org/glossary/periodicals/index.htm',
      75        'http://www.marxists.org/glossary/terms/index.htm',
      76        ]
      77    
      78    
      79    # -----------------------------------------------------------------------------
      80    # If proxy server in use:
      81    # % pon
      82    #        (Wait until ifconfig shows ppp0 interface.)
      83    # % http_proxy="http://www.someproxy.com:3128"
      84    # % export http_proxy
      85    # % python mia_glos.py
      86    
      87    
      88    #
      89    # Abstract things that need to be referred to with a variable.
      90    ##########
      91    ##########
      92    information['www_marxists_org'] = 'www.marxists.org'
      93    information['htm'] = 'htm'
      94    information['re']['http'] = 'http:'
      95    information['re']['href'] = 'href='
      96    information['re']['br_tag'] = '(.*)<br[^>]*>(.*)'
      97    information['re']['href_value'] = 'href="([^"]+)"'
      98    information['re']['a_tag_with_href'] = '<a[^>]+href=[^>]+>'
      99    
     100    # Stuff to examine index files for each sub-section of glossary.
     101    #######
     102    #######
     103    # Tags that mark beginning and ending of section with sub links.
     104    information['re']['lvl1_switch_on'] = '<map[ ]+name="letters">'
     105    information['re']['lvl1_switch_off'] = '</map>'
     106    #
     107    # Type of links with link section.
     108    information['re']['glossary_links_index'] = '^[a-z]/index.' + \
        information['htm'] + '$'
     110    information['re']['glossary_links_letter'] = '[a-z]/[a-z].' + \
        information['htm'] + '$'
     112    
     113    
     114    # Stuff to examine index files for each sub-section of glossary.
     115    #######
     116    #######
     117    information['re'][
     118        'index_links_valid'] = '^[a-z].' + \
        information['htm'] + '$'
     120    information['re'][
     121        'href_value'] = 'href="([^"]+)"'
     122    
     123    
     124    # Stuff to extract glossary entries.
     125    #######
     126    #######
     127    information['re']['entries_a_name'] = '<a[ ]+name="([^"]+)"[ ]*>'
     128    #
     129    # 2nd pair of parentheses needed to capture text if text starts on same line.
     130    p_span = '(?:p|span)'
     131    #
     132    # <p class="term" style="color: #990000">
     133    information['re']['entries_p_Term_open'] = '<' + p_span + '[ ]+class="term"[^>]*>(.*)'
     134    information['re']['entries_p__close'] = '(.*)</[ ]*' + p_span + '[ ]*>'
     135    
     136    
     137    # -----------------------------------------------------------------------------
     138    # -----------------------------------------------------------------------------
     139    # http://www.marxists.org/admin/workshop/info/html.htm#s2
     140    #
     141    # Encyclopedia notes: Most of the footnotes in the text you are
     142    # transcribing will simply be links to the Encyclopedia; i.e. when there
     143    # are words in the text that need explanation, we link those words to
     144    # the information in the Encyclopedia. If the word
     145    # (i.e. person/place/event/term/organisation/periodical) isn't already
     146    # in the Glossary, mail the definition to Brian and he'll put them
     147    # in. When that is taken care of, you need only link the word to the
     148    # Encyclopedia.
     149    # 
     150    # 1) The Encyclopedia is located at the root directory, so links to it
     151    # have to go out a five directories. The first portion of your link will
     152    # look like: <a href="../../../../../glossary/">.
     153    # 
     154    # 2) Next, find out which glossary the term is located in. We have six
     155    # glossaries: people, places, events, orgs, periodicals, and terms. To
     156    # find the word in the right glossary:
     157    #
     158    # People: Any individual 
     159    # 
     160    # Events: A time and place when something significant happened. 
     161    # 
     162    # Places: A significant location. 
     163    # 
     164    # Terms: Any definition. (i.e. not an event, person, place, periodical
     165    # or organisation - if it isn't material, it belongs here.)
     166    # 
     167    # Periodicals: Any newspaper, magazine, etc. 
     168    # 
     169    # Organisations: Any kind of established group, party, government,
     170    # dynasty, tribe, etc.
     171    # 
     172    # 3) To continue by example. Say you are linking to the word "Pravda";
     173    # at this point we know our link will be: <a
     174    # href="../../../../../glossary/periodicals/">. In order to find the
     175    # word in the glossary, you use the first two letters of the word. The
     176    # link will next go to the directory "p/", and the second letter "r" is
     177    # the html file you grab onto; so your link is now: <a
     178    # href="../../../../../glossary/periodicals/p/r.htm">
     179    # 
     180    # 4) Finally, you need to put the full word on as an anchor to the end
     181    # of the link. The final link thus looks like: <a
     182    # href="../../../../../glossary/periodicals/p/r.htm#pravda"> Use
     183    # lowercase letters for the anchors, and when there is a space in the
     184    # word, replace spaces with a hyphen.
     185    # 
     186    # Whew! Take a look at this Example to see lots of different links made
     187    # to the glossary (don't worry about linking so many words as this; you
     188    # only need to link those words that come up as footnotes in the text
     189    # you are transcribing). If this explaination just sounded confusing to
     190    # you, don't worry about it. We prefer you use the Encyclopedia, but you
     191    # can just use regular footnotes to put down these terms if you'd like.
     192    
     193    # -----------------------------------------------------------------------------
     194    # -----------------------------------------------------------------------------
     195    # Glossary entries are found three levels down:
     196    
     197    # 1.
     198    # http://marxists.org/glossary/people/index.htm
     199    # <map name="letters">
     200    #                         <area shape="rect" coords="532, 75, 570, 116"
     201    #                                 href="z/index.htm">
     202    #                         <area shape="rect" coords="491, 75, 529, 116"
     203    #                                 href="y/index.htm">
     204    
     205    # 2.
     206    # http://marxists.org/glossary/people/a/index.htm
     207    #  <td>
     208    #  <a href="k.htm">Ak</a>
     209    #   </td>
     210    #  <td>
     211    #   <a href="l.htm">Al</a>
     212    #   </td>
     213    
     214    # 3a.
     215    # http://marxists.org/glossary/people/a/b.htm
     216    #    <a name="abern-martin"></a>
     217    #    <p>&#160;</p>
     218    #    <p class="term">
     219    #    Abern, Martin  (1898-1949)
     220    #    </p>
     221    #
     222    # 3b.
     223    # http://www.marxists.org/glossary/events/w/o.htm
     224    #    <a name="tannenberg-1914"></a>
     225    #    <p class="fst">
     226    #    <span class="term">The Battle of Tannenberg:</span>
     227    # (found that one when...
     228    # 
     229    # Traceback (innermost last):
     230    #   File "mia_glos.py", line 593, in ?
     231    #     program()
     232    #   File "mia_glos.py", line 243, in __call__
     233    #     self.extract_entries()
     234    #   File "mia_glos.py", line 463, in extract_entries
     235    #     self.remember_entry_title(url)
     236    #   File "mia_glos.py", line 336, in remember_entry_title
     237    #     self.i[
     238    # KeyError: tannenberg-1914
     239    
     240    
     241    # -----------------------------------------------------------------------------
     242    # -----------------------------------------------------------------------------
     243    
     244    # FINAL NOTES:
     245    
     246    # If Web pages read by this script were valid XHTML, SAX could be used
     247    # and this script would be more "robust."  SAX is
     248    # (http://www.megginson.com/SAX/) the Simple API for XML (a standard
     249    # interface for event-based XML parsing).
     250    
     251    
     252    # Discovered while executing:
     253    #
# 1. ERROR: the closing p-tag must be searched for explicitly after the
#           opening p-tag.
     256    #
     257    # 2. TWIST: either p or span can have class="term".
     258    #
     259    # 3. ERROR: James Burnham (people/b/u.htm) is missing a closing p-tag
     260    #           (sent email to Brian, 2001.06.11).
     261    
     262    
     263    class Mia_glos:
     264    
     265        def debug_msg(self, msg):
     266            sys.stderr.write(msg)
     267            pass
     268    
     269    
     270        def __init__(self, information):
     271            # Attribute called when class instantiated.
     272    
     273            self.i = information
     274            self.debug = self.i['debug']
     275    
     276            # There can be more than one anchor per glossary entry, for example:
     277            # <a name="hirsch-duncker"></a>
     278            # <a name="hirsch-duncker-trade-unions"></a>
     279            #
     280            self.term = None
     281            self.i['term_queue'] = []
     282    
     283            # This will have pointers to pages with glossary entries.
     284            self.i['rolodex'] = {}
     285            self.i['rolodex']['idx'] = {}
     286            self.i['rolodex']['ltr'] = {}
     287    
     288            
     289            # EXPRESSIONS:
     290            #
     291            self.br_tag = re.compile(
     292                self.i['re']['br_tag'],
     293                re.I)
     294            self.re_a_tag_with_href = re.compile(
     295                self.i['re']['a_tag_with_href'],
     296                re.I)
     297            self.re_http = re.compile(
     298                self.i['re']['http'],
     299                re.I)
     300            self.re_href = re.compile(
     301                self.i['re']['href'],
     302                re.I)
     303            self.re_href_value = re.compile(
     304                self.i['re']['href_value'],
     305                re.I)
     306            self.re_index_links_valid = re.compile(
     307                self.i['re']['index_links_valid'],
     308                re.I)
     309            self.re_entries_a_name = re.compile(
     310                self.i['re']['entries_a_name'],
     311                re.I)
     312            self.re_entries_p_Term_open = re.compile(
     313                self.i['re']['entries_p_Term_open'],
     314                re.I)
     315            self.re_entries_p__close = re.compile(
     316                self.i['re']['entries_p__close'],
     317                re.I)
     318            #
     319            # HREF= value
     320            self.re_glossary_links_index = re.compile(
     321                self.i['re']['glossary_links_index'], re.I
     322                )
     323            self.re_glossary_links_letter = re.compile(
     324                self.i['re']['glossary_links_letter'], re.I
     325                )
     326            #
     327            self.re_switch_on = re.compile(
     328                self.i['re']['lvl1_switch_on'],
     329                re.I)
     330            self.re_switch_off = re.compile(
     331                self.i['re']['lvl1_switch_off'],
     332                re.I)
     333    
     334    
     335            pass
     336    
     337    
     338        def __call__(self, dictionary):
     339            # Default attribute used when class instance called without explicit
     340            # attribute.  Like 'MAIN'.
     341    
     342            # Look at glossaries and store URLs that contain glossary entries.
     343    
     344            # Module to read a file whether it's MAC, UNIX or DOS.
     345            sys.path.append(os.path.expanduser('~/bin'))
     346            import do_ma_un
     347            # DOs/MAc/UNix line handler:
     348            self.food_processor = do_ma_un.Do_ma_un()
     349    
     350            #
     351            # -------------------------------------------------------
     352            # Real one:
     353            self.glossaries(dictionary)
     354            # -------------------------------------------------------
     355            # Short one for debugging:
     356            # self.i['rolodex']['ltr']['http://marxists.org/glossary/orgs/c/a.htm'] = {}
     357            # self.i['rolodex']['ltr']['http://marxists.org/glossary/events/c/o.htm'] = {}
     358            # self.i['rolodex']['ltr']['http://marxists.org/glossary/people/b/e.htm'] = {}
     359            # -------------------------------------------------------
     360    
     361    
     362            # Flip through rolodex and extract glossary entires.
     363            self.extract_entries()
     364    
     365            # Print final results.
     366            self.output_entries()
     367    
     368            pass
     369    
     370    
     371        def add_to_rolodex(self, type, url):
     372            """Add to collection of pointers that point to pages with entries."""
     373    
     374            # Change from relative to absolute:
     375            url = os.path.join(self.base_url, url)
     376    
     377            spacer = '  '
     378            if type == 'ltr': spacer = spacer * 2
     379            self.debug_msg(spacer + url + '\n')
     380    
     381            self.i['rolodex'][type][url] = {}
     382            return url
     383    
     384    
     385        def parse_glossary_section(self, spam):
     386            """Return glossary sub-section name."""
     387    
     388            # For example, given this string, return "orgs":
     389            # http://marxists.org/glossary/orgs/h/i.htm
     390            #
     391            return re.compile('glossary/([^/]+)', re.I).search(spam).group(1)
     392            pass
     393    
     394    
     395        def output_entries(self):
     396            """Print results."""
     397    
     398            # Sample:
     399            #
     400            # raise str(self.i['rolodex'])
     401            #
     402            # {
     403            #  'http://marxists.org/glossary/orgs/h/a.htm':
     404            #    {'habsburg-dynasty': {'entry_title': ' Habsburg Dynasty'}},
     405            #
     406            #  'http://marxists.org/glossary/orgs/h/o.htm':
     407            #    {'hohenzollern-dynasty': {'entry_title': ' Hohenzollern Dynasty'}},
     408            #
     409            #  'http://marxists.org/glossary/orgs/h/i.htm':
     410            #    {'hirsch-duncker':
     411            #      {'entry_title': ' Hirsch-Duncker Trade Unions '},
     412            #     'hirsch-duncker-trade-unions':
     413            #      {'entry_title': ' Hirsch-Duncker Trade Unions '}  }
     414            # }
     415    
     416            urls_sorted = self.i['rolodex']['ltr'].keys()
     417            urls_sorted.sort()
     418            for url_key in urls_sorted:
     419                dictionary['url'] = url_key
     420    
     421                if self.debug: self.debug_msg('\n')
     422    
     423                terms_sorted = self.i['rolodex']['ltr'][dictionary['url']].keys()
     424                terms_sorted.sort()
     425                for term in terms_sorted:
     426    
     427                    glossary_section = self.parse_glossary_section(dictionary['url'])
     428    
     429                    entry_title = self.i['rolodex']['ltr'][dictionary['url']][
     430                        term]['entry_title']
     431    
     432                    #
     433                    # DONE.
     434                    # DONE.
     435                    # DONE.
     436                    tuple = (glossary_section, entry_title, term, url_key)
     437                    tuple = (glossary_section, term, entry_title, url_key)
     438                    
     439                    sys.stdout.write(string.join(tuple, '\t') + '\n')
     440                    pass
     441                pass
     442    
     443            pass
     444    
     445    
     446        def remember_entry_title(self, url):
     447            """Store entry_title in rolodex."""
     448    
     449            # Change br tags to semi-colon:
     450            local_obj = self.br_tag.search(self.entry_title)
     451            while not local_obj == None:
     452                self.entry_title = local_obj.group(1) + ';' + local_obj.group(2)
     453                local_obj = self.br_tag.search(self.entry_title)
     454                pass        
     455    
     456            if self.debug: self.debug_msg('    title: ' + self.entry_title + '\n')
     457    
     458            #
     459            # 1 of 2.
     460            for term in self.i['term_queue']:
     461                self.i[
     462                    'rolodex']['ltr'][
     463                    url][
     464                    term][
     465                    'entry_title'] = self.entry_title
     466                pass
     467            #
     468            # 2 of 2.
     469            self.entry_title = None
     470            self.i['term_queue'] = []
     471    
     472            pass
     473    
     474    
     475        def extract_processor(self, dictionary):
     476            """Extract glossary entries from logical lines."""
     477            
     478            self.match_obj_tag_open = self.re_entries_a_name.search(
     479                dictionary['logical_line'])
     480            if not self.match_obj_tag_open == None:
     481    
     482                # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
     483                # vvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvvv
     484                # Also set to None here (in addition to top-of-file).
     485                self.entry_title = None
     486    
     487                self.term = self.match_obj_tag_open.group(1)
     488                self.i['rolodex']['ltr'][dictionary['url']][self.term] = {}
     489    
     490                # Needed to make duplicates when more than one anchor
     491                # for current glossary entry.
     492                self.i['term_queue'].append(self.term)
     493    
     494                # WARNING:
     495                # Will not work if more than one on a single line!
     496                # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     497                # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     498    
     499                if self.debug: self.debug_msg('     term: ' + self.term + '\n')
     500    
     501                pass
     502                    
     503    
     504            # Somewhat more complicated to get text surrounded by p-tags.
     505            # We can still assume closing ">" is on same line as its opening "<".
     506            # However, cannot assume anything about where text between open and
     507            # close tags starts, or ends.
     508            #
     509            # (See also .py script to parse CSS.)
     510            #
     511            # Example:
     512            #   <p class="term">
     513            #   Abern, Martin  (1898-1949)
     514            #   </p>                                
     515            
     516            self.match_obj_tag_open = self.re_entries_p_Term_open.search(
     517                dictionary['logical_line'])
     518            self.match_obj_tag_close = self.re_entries_p__close.search(
     519                dictionary['logical_line'])
     520            
     521            if not self.match_obj_tag_open == None:
     522                #
     523                # If data starts on same line as opening "p" tag, grab it!
     524                self.entry_title = self.match_obj_tag_open.group(1)
     525                
     526                # Ignore closing tag if on same line as opening tag.
     527                #
     528                # Ignore closing tag if _after_ opening tag (see
     529                # people/t/r "trotsky" for example).
     530                self.match_obj_tag_close = self.re_entries_p__close.search(
     531                    dictionary['logical_line'][self.match_obj_tag_open.start():])
     532                
     533                if not self.match_obj_tag_close == None:
     534                    
     535                    self.match_obj = self.re_entries_p__close.search(
     536                        self.entry_title)
     537                    self.entry_title = self.match_obj.group(1)
     538                    
     539                    # When open-p and close-p tags on same line:
     540                    self.remember_entry_title(dictionary['url'])
     541                    
     542                    # Avoid processing a second time immediately below!
     543                    self.match_obj_tag_close = None
     544                    
     545                    pass                    
     546                pass
     547    
     548            if not self.match_obj_tag_close == None:
     549                # TO-DO:
     550                #
     551                #   Terms are supposed to be unique.
     552                #   Might want to check for duplicates (case insensitive) both
     553                #   within this file and across all files.
     554                
     555                if not self.entry_title == None:
     556    
     557                    # Add text appearing before close tag.
     558                    #
     559                    # Cannot assume just one p__close, for example this line:
     560                    #          Belinsky, Vissarion (1811-1848)</P>
     561                    # (cont.)  <P>Russian literary critic who supported
     562                    # (cont.)  socially critical writers.</P>
     563                    #
     564                    # (Oh, how nice it would be to use SAX!!!)
     565                    #
     566                    while not self.match_obj_tag_close == None:
     567                        shrinker = self.match_obj_tag_close.group(1)
     568                        self.match_obj_tag_close = self.re_entries_p__close.search(
     569                            shrinker)
     570                        pass
     571                    #
     572                    self.entry_title = string.join(
     573                        [self.entry_title, shrinker])
     574    
     575    
     576                    # Change newlines and double-spaces to a space.
     577                    self.entry_title = re.sub('[\s]+', " ", self.entry_title)
     578                    self.entry_title = string.strip(self.entry_title)
     579                            
     580                    # When open-p and close-p tags not on same line:
     581                    self.remember_entry_title(dictionary['url'])
     582                    pass
     583    
     584                pass
     585    
     586            # Text within paragraph on a line without tags.
     587            if (self.match_obj_tag_open == None) and (
     588                self.match_obj_tag_close == None) :
     589                if not self.entry_title == None:
     590                    self.entry_title = string.join(
     591                        [self.entry_title, dictionary['logical_line']])
     592                    pass
     593                pass
     594    
     595            pass
     596        
     597    
     598        def extract_entries(self):
     599            """Create dictionary w/ keys from '<a name=' and values from '<p term='."""
     600    
     601            self.debug_msg(         '\n')
     602            self.debug_msg('-'*33 + '\n')
     603            
     604            urls_sorted = self.i['rolodex']['ltr'].keys()
     605            urls_sorted.sort()
     606    
     607            for url_key in urls_sorted:
     608                dictionary['url'] = url_key
     609    
     610                self.debug_msg('extract:  ' +url_key + '\n')
     611    
     612                self.entry_title = None
     613                #  -- -- -- -- -- -- -- -- -- --
     614                dictionary['inputs'] = [dictionary['url']]
     615                dictionary['function_to_call'] = dictionary['function_process_extract']
     616                self.food_processor(dictionary)
     617                
     618                pass
     619            pass
     620    
     621    
     622        def indexes_processor(self, dictionary):
     623            """Called once for each logical line."""
     624            
     625            if self.re_a_tag_with_href.search(dictionary['logical_line']):
     626                match_obj = self.re_href_value.search(dictionary['logical_line'])
     627                appendage = match_obj.group(1)
     628    
     629                if self.re_index_links_valid.search(appendage):
     630    
     631                    # - appendage_subdirectory = string.split(dictionary['url'], '/')[-2]
     632                    # - spam = os.path.join(appendage_subdirectory, appendage)
     633    
     634                    # These URLs contain glossary entries.
     635                    self.add_to_rolodex('ltr', appendage)
     636    
     637                    pass
     638                pass
     639            pass
     640    
     641    
     642        def indexes(self):
     643            """Read index file for a letter from a particular glossary."""
     644    
     645            # dictionary = {}
     646            dictionary['inputs'] = [dictionary['url']]
     647            dictionary['function_to_call'] = dictionary['function_process_indexes']
     648            self.food_processor(dictionary)
     649    
     650            pass
     651    
     652    
     653        def glossaries_processor(self, dictionary):
     654            """Called once for each logical line."""
     655            
     656            # Where to begin looking for sub-links:
     657            if self.re_switch_on.search(dictionary['logical_line']):
     658                self.switch = 't'
     659                pass
     660            elif self.re_switch_off.search(dictionary['logical_line']):
     661                self.switch = None
     662                pass
     663    
     664            if self.switch:
     665                if self.re_href.search(dictionary['logical_line']):
     666                    match_obj = self.re_href_value.search(dictionary['logical_line'])
     667                    appendage = match_obj.group(1)
     668    
     669                    if self.debug: self.debug_msg(appendage + '\n')
     670    
     671                    # 2001.06.10
     672                    # Exceptions!
     673                    # Usually "a/index.htm"
     674                    #
     675                    # EXCEPTION: "q/u.htm"   ("u" is only letter under "q")
     676                    # ...thus the 'elif' below.
     677    
     678                    if self.re_glossary_links_index.match(appendage):
     679                        # Pass index to attribute that processes next level.
     680                        #
     681                        self.add_to_rolodex('idx', appendage)
     682                        pass
     683                    elif self.re_glossary_links_letter.match(appendage):
     684                        #
     685                        self.add_to_rolodex('ltr', appendage)
     686                        pass
     687                    pass
     688                pass                
     689            pass
     690       
     691    
     692        def glossaries(self, dictionary):
     693            """Begin reading glossaries (terms, orgs, places, etc.)."""
     694    
     695            # Read each glossary:
     696            for url_glossary in self.i['glossary_urls']:
     697                dictionary['url'] = url_glossary
     698    
     699                self.base_url = re.sub('[^/]+$', "", url_glossary)
     700                if self.debug or 't':
     701                    self.debug_msg(         '\n')
     702                    self.debug_msg('-'*33 + '\n')
     703                    self.debug_msg('GLOSSARY: ' + url_glossary    + '\n')
     704                    pass
     705    
     706                # Switches (turn "on" certain part of page for searching).
     707                self.switch = None
     708    
     709                # dictionary = {}
     710                dictionary['inputs'] = [dictionary['url']]
     711                dictionary['function_to_call'] = dictionary['function_process_glossaries']
     712                self.food_processor(dictionary)
     713    
     714                pass
     715            
     716            # Loop through glossary indexes.
     717            keys = self.i['rolodex']['idx'].keys()
     718            keys.sort()
     719            for url_index in keys:
     720                dictionary['url'] = url_index
     721    
     722                self.base_url = re.sub('[^/]+$', "", url_index)
     723                if self.debug or 't':
     724                    self.debug_msg(         '\n')
     725                    self.debug_msg('-'*33 + '\n')
     726                    self.debug_msg('INDEX: ' + url_index    + '\n')
     727                    pass
     728    
     729                self.indexes()
     730                pass
     731            pass
     732    
     733    
     734        # End of class.
     735        # End of class.
     736        # End of class.
     737        pass
     738    
     739    
     740    # MAIN:
     741    Program = Mia_glos(information)
     742    #
     743    dictionary = {}
     744    dictionary['function_process_glossaries'] = Program.glossaries_processor
     745    dictionary['function_process_indexes'] = Program.indexes_processor
     746    dictionary['function_process_extract'] = Program.extract_processor
     747    
     748    Program(dictionary)
     749    
     750    ###
     751    #