Zdrojový kód pro aleph.aleph

#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# Interpreter version: python 2.7
#
# Imports =====================================================================
"""
Aleph X-Service wrapper.

This module allows you to query Aleph's X-Services_ module (Aleph server is
defined by :attr:`aleph.settings.ALEPH_URL` in :mod:`settings.py
<aleph.settings>`).

.. _X-Services: http://www.exlibrisgroup.com/category/MetaLibXServer

There are two levels of abstraction.

Lowlevel
========

You can use this functions to access Aleph::

    searchInAleph(base, phrase, considerSimilar, field)
    downloadRecords(search_result, [from_doc])
    getDocumentIDs(aleph_search_result, [number_of_docs])
    downloadMARCXML(doc_id, library)
    downloadMARCOAI(doc_id, base)

Workflow
********

Aleph works in strange way, that he won't allow you to access desired
information directly.

You have to create search request by calling :func:`searchInAleph` first, which
will return dictionary with few important informations about session.

This dictionary can be later used as parameter to :func:`getDocumentIDs`
function, which will give you list of :class:`DocumentID` named tuples.

Note:
    :py:func:`~collections.namedtuple` is used, because to access your
    document, you don't need just `document ID` number, but also `library ID`
    string.

Depending on your system, there may be just only one accessible library, or
multiple ones, and then you will be glad, that you get both of this
informations together.

:class:`DocumentID` can be used as parameter to :func:`downloadMARCXML`.

Lets look at some code::

    ids = getDocumentIDs(searchInAleph("nkc", "test", False, "wrd"))
    for id_num, library in ids:
        XML = downloadMARCXML(id_num, library)

        # processDocument(XML)

High-level
==========

XML wrappers
************
This wrappers returns full XML records from Aleph:

    - :func:`getISBNsXML`
    - :func:`getAuthorsBooksXML`
    - :func:`getPublishersBooksXML`
    - :func:`getBooksTitleXML`
    - :func:`getICZBooksXML`

ID wrappers
***********
There are wrappers, which returns ID's of matching document in Aleph:

    - :func:`getISBNsIDs`
    - :func:`getAuthorsBooksIDs`
    - :func:`getPublishersBooksIDs`
    - :func:`getBooksTitleIDs`
    - :func:`getICZBooksIDs`

You can theh download them using :func:`downloadMARCXML` or
:func:`downloadMARCOAI`.

Count wrappers
**************
Count wrappers returns just the number of records with given parameters are
there in aleph.

    - :func:`getISBNCount`
    - :func:`getAuthorsBooksCount`
    - :func:`getPublishersBooksCount`
    - :func:`getBooksTitleCount`
    - :func:`getICZBooksCount`

Note:
    Counting functions are by one request faster than just counting results
    from standard getters. It is preferred to use them to reduce load to Aleph.

Other noteworthy properties
===========================

List of valid bases can be obtained by calling :func:`getListOfBases`, which
returns list of strings.

There is also defined exception tree - see :class:`AlephException` doc-string
for details.
"""
from collections import namedtuple
from string import Template
from urllib import quote_plus

import dhtmlparser
from httpkie import Downloader

from settings import *


# Variables ===================================================================
# String.Template() variable convention is used
SEARCH_URL_TEMPLATE = "/X?op=find&request=$FIELD=$PHRASE&base=$BASE"
SET_URL_TEMPLATE = "/X?op=ill_get_set&set_number=$SET_NUMBER" + \
                   "&start_point=1&no_docs=$NUMBER_OF_DOCS"
DOC_URL_TEMPLATE = "/X?op=ill_get_doc&doc_number=$DOC_ID&library=$LIBRARY"
OAI_DOC_URL_TEMPLATE = "/X?op=find_doc&doc_num=$DOC_ID&base=$BASE"
RECORD_URL_TEMPLATE = "/X?op=present&set_number=$SET_NUM&set_entry=$RECORD_NUM"

MAX_RECORDS = 30


VALID_ALEPH_FIELDS = [
    "wrd",
    "wtl",
    "wau",
    "wkw",
    "txt",
    "wpb",
    "wpp",
    "wyr",
    "ssn",
    "sbn",
    "isn",
    "ob",
    "wpf",
    "wpv",
    "wln",
    "wlo",
    "wtp",
    "sg",
    "bar",
    "cnb",
    "icz",
    "sys",
    "wpk",
]
"""
- ``wrd`` - Všechny údaje [`All fields`]
- ``wtl`` - Název [`Title/name of the book`]
- ``wau`` - Autor (osoba, korporace) [`Author (person, corporation)`]
- ``wkw`` - Předmět (klíčová slova) [`Subject (keywords)`]
- ``txt`` - Slova z obsahu (table of cont.) [`Words from table of content`]
- ``wpb`` - Nakladatel [`Publisher`]
- ``wpp`` - Místo vydání [`Place of publication`]
- ``wyr`` - Rok vydání [`Year of publication`]
- ``ssn`` - ISSN
- ``sbn`` - ISBN / ISMN
- ``isn`` - ISBN / ISMN / ISSN
- ``ob``  - Obsazení (hudební díla) [`Cast (musical works)`]
- ``wpf`` - Periodicita [`Periodicity`]
- ``wpv`` - Kód země vydání [`Country code`]
- ``wln`` - Kód jazyka dokumentu [`Language code`]
- ``wlo`` - Kód jazyka originálu [`Lanugage code of original`]
- ``wtp`` - Druh dokumentu [`Type of document`]
- ``sg``  - Signatura [`Signature`]
- ``bar`` - Čárový kód [`Barcode`]
- ``cnb`` - Číslo národní bibl. [`Number of national bibl.`]
- ``icz`` - Identifikační číslo [`Identification number`]
- ``sys`` - Systémové číslo [`System number`]
- ``wpk``
"""

dhtmlparser.NONPAIR_TAGS = []  # used for parsing XML - see documentation


# Functions & objects =========================================================
[dokumentace]class AlephException(Exception): """ Exception tree:: - AlephException |- InvalidAlephBaseException |- InvalidAlephFieldException |- LibraryNotFoundException `- DocumentNotFoundException """ def __init__(self, message): Exception.__init__(self, message)
[dokumentace]class InvalidAlephBaseException(AlephException): def __init__(self, message): super(InvalidAlephBaseException, self).__init__(message)
[dokumentace]class InvalidAlephFieldException(AlephException): def __init__(self, message): super(InvalidAlephFieldException, self).__init__(message)
[dokumentace]class LibraryNotFoundException(AlephException): def __init__(self, message): super(LibraryNotFoundException, self).__init__(message)
[dokumentace]class DocumentNotFoundException(AlephException): def __init__(self, message): super(DocumentNotFoundException, self).__init__(message)
[dokumentace]class DocumentID(namedtuple("DocumentID", ["id", "library", "base"])): """ This structure is used to store `"pointer"` to document in aleph. Attributes: id (int): ID of document. library (str): This can be different for each document, depend on your system. base (str): Default "``nkc``", but really depends on what bases you have defined in your Aleph server. """ pass
[dokumentace]def getListOfBases(): """ This function is here mainly for purposes of unittest Returns: list of str: Valid bases as they are used as URL parameters in links at Aleph main page. """ downer = Downloader() data = downer.download(ALEPH_URL + "/F/?func=file&file_name=base-list") dom = dhtmlparser.parseString(data.lower()) # from default aleph page filter links containing local_base in their href base_links = filter( lambda x: "href" in x.params and "local_base" in x.params["href"], dom.find("a") ) # split links by & - we will need only XXX from link.tld/..&local_base=XXX base_links = map( lambda x: x.params["href"].replace("?", "&", 1).split("&"), base_links ) # filter only sections containing bases bases = map( lambda link: filter(lambda base: "local_base=" in base, link)[0], base_links ) # filter bases from base sections bases = map(lambda x: x.split("=")[1].strip(), bases) return list(set(bases)) # list(set()) is same as unique()
def _tryConvertToInt(s): """ Try convert value from `s` to int. Returns: int(s): If the value was successfully converted, or `s` when conversion failed. """ try: return int(s) except ValueError: return s def _alephResultToDict(dom): """ Convert part of non-nested XML to :py:class:`dict`. Args: dom (HTMLElement tree): pre-parsed XML (see dhtmlparser). Returns: dict: with python data """ result = {} for i in dom.childs: if not i.isOpeningTag(): continue keyword = i.getTagName().strip() value = _tryConvertToInt(i.getContent().strip()) # if there are multiple tags with same keyword, add values into # array, instead of rewriting existing value at given keyword if keyword in result: # if it is already there .. if isinstance(result[keyword], list): # and it is list .. result[keyword].append(value) # add it to list else: # or make it array result[keyword] = [result[keyword], value] else: # if it is not in result, add it result[keyword] = value return result
[dokumentace]def searchInAleph(base, phrase, considerSimilar, field): """ Send request to the aleph search engine. Request itself is pretty useless, but it can be later used as parameter for :func:`getDocumentIDs`, which can fetch records from Aleph. Args: base (str): which database you want to use phrase (str): what do you want to search considerSimilar (bool): fuzzy search, which is not working at all, so don't use it field (str): where you want to look (see: :attr:`VALID_ALEPH_FIELDS`) Returns: dictionary: consisting from following fields: | error (optional): present if there was some form of error | no_entries (int): number of entries that can be fetch from aleph | no_records (int): no idea what is this, but it is always >= than `no_entries` | set_number (int): important - something like ID of your request | session-id (str): used to count users for licensing purposes Example: Returned dict:: { 'session-id': 'YLI54HBQJESUTS678YYUNKEU4BNAUJDKA914GMF39J6K89VSCB', 'set_number': 36520, 'no_records': 1, 'no_entries': 1 } Raises: AlephException: if Aleph doesn't return any information InvalidAlephFieldException: if specified field is not valid """ downer = Downloader() if field.lower() not in VALID_ALEPH_FIELDS: raise InvalidAlephFieldException("Unknown field '" + field + "'!") param_url = Template(SEARCH_URL_TEMPLATE).substitute( PHRASE=quote_plus(phrase), # urlencode phrase BASE=base, FIELD=field, SIMILAR="Y" if considerSimilar else "N" ) result = downer.download(ALEPH_URL + param_url) dom = dhtmlparser.parseString(result) find = dom.find("find") # find <find> element :) if len(find) <= 0: raise AlephException("Aleph didn't returned any information.") find = find[0] # convert aleph result into dictionary result = _alephResultToDict(find) # add informations about base into result result["base"] = base if "error" not in result: return result # handle errors if result["error"] == "empty set": result["no_entries"] = 0 # empty set have 0 entries return result else: raise AlephException(result["error"])
[dokumentace]def downloadRecords(search_result, from_doc=1): """ Download `MAX_RECORDS` documents from `search_result` starting from `from_doc`. Attr: search_result (dict): returned from :func:`searchInAleph`. from_doc (int, default 1): Start from document number `from_doc`. Returns: list: List of XML strings with documents in MARC OAI. """ downer = Downloader() if "set_number" not in search_result: return [] # set numbers should be probably aligned to some length set_number = str(search_result["set_number"]) if len(set_number) < 6: set_number = (6 - len(set_number)) * "0" + set_number # download all no_records records = [] for cnt in range(search_result["no_records"]): doc_number = from_doc + cnt if cnt >= MAX_RECORDS or doc_number > search_result["no_records"]: break set_data = downer.download( ALEPH_URL + Template(RECORD_URL_TEMPLATE).substitute( SET_NUM=set_number, RECORD_NUM=doc_number, ) ) records.append(set_data) return records
[dokumentace]def getDocumentIDs(aleph_search_result, number_of_docs=-1): """ Get IDs, which can be used as parameters for other functions. Args: aleph_search_result (dict): returned from :func:`searchInAleph` number_of_docs (int, optional): how many :class:`DocumentID` from set given by `aleph_search_result` should be returned. Default -1 for all of them. Returns: list: :class:`DocumentID` named tuples to given `aleph_search_result`. Raises: AlephException: If Aleph returns unknown format of data. Note: Returned :class:`DocumentID` can be used as parameters to :func:`downloadMARCXML`. """ downer = Downloader() if "set_number" not in aleph_search_result: return [] # set numbers should be probably aligned to some length set_number = str(aleph_search_result["set_number"]) if len(set_number) < 6: set_number = (6 - len(set_number)) * "0" + set_number # limit number of fetched documents, if -1, download all if number_of_docs <= 0: number_of_docs = aleph_search_result["no_entries"] # download data about given set set_data = downer.download( ALEPH_URL + Template(SET_URL_TEMPLATE).substitute( SET_NUMBER=set_number, NUMBER_OF_DOCS=number_of_docs, ) ) # parse data dom = dhtmlparser.parseString(set_data) set_data = dom.find("ill-get-set") # there should be at least one <ill-get-set> field if len(set_data) <= 0: raise AlephException("Aleph didn't returned set data.") ids = [] for library in set_data: documents = _alephResultToDict(library) if "error" in documents: raise AlephException("getDocumentIDs: " + documents["error"]) # convert all document records to DocumentID named tuple and extend # them to 'ids' array if isinstance(documents["doc-number"], list): ids.extend( map( lambda x: DocumentID( x, documents["set-library"], aleph_search_result["base"] ), set(documents["doc-number"]) ) ) else: ids.append( DocumentID( documents["doc-number"], documents["set-library"], aleph_search_result["base"] ) ) return ids
[dokumentace]def downloadMARCXML(doc_id, library, base="nkc"): """ Download MARC XML document with given `doc_id` from given `library`. Args: doc_id (DocumentID): You will get this from :func:`getDocumentIDs`. library (str): "``NKC01``" in our case, but don't worry, :func:`getDocumentIDs` adds library specification into :class:`DocumentID` named tuple. Returns: str: MARC XML unicode string. Raises: LibraryNotFoundException DocumentNotFoundException """ downer = Downloader() data = downer.download( ALEPH_URL + Template(DOC_URL_TEMPLATE).substitute( DOC_ID=doc_id, LIBRARY=library ) ) dom = dhtmlparser.parseString(data) # check if there are any errors # bad library error error = dom.find("login") if error: error_msg = error[0].find("error") if error_msg: raise LibraryNotFoundException( "Can't download document doc_id: '" + str(doc_id) + "' " + "(probably bad library: '" + library + "')!\nMessage: " + "\n".join(map(lambda x: x.getContent(), error_msg)) ) # another error - document not found error = dom.find("ill-get-doc") if error: error_msg = error[0].find("error") if error_msg: raise DocumentNotFoundException( "\n".join(map(lambda x: x.getContent(), error_msg)) ) return data # MARCxml of document with given doc_id
[dokumentace]def downloadMARCOAI(doc_id, base): """ Download MARC OAI document with given `doc_id` from given (logical) `base`. Funny part is, that some documents can be obtained only with this function in their full text. Args: doc_id (str): You will get this from :func:`getDocumentIDs`. base (str, optional): Base from which you want to download Aleph document. This seems to be duplicite with :func:`searchInAleph` parameters, but it's just something Aleph's X-Services wants, so .. Returns: str: MARC XML Unicode string. Raises: InvalidAlephBaseException DocumentNotFoundException """ downer = Downloader() data = downer.download( ALEPH_URL + Template(OAI_DOC_URL_TEMPLATE).substitute( DOC_ID=doc_id, BASE=base ) ) dom = dhtmlparser.parseString(data) # check for errors error = dom.find("error") if len(error) <= 0: # no errors return data if "Error reading document" in error[0].getContent(): raise DocumentNotFoundException( str(error[0].getContent()) ) else: raise InvalidAlephBaseException( error[0].getContent() + "\n" + "The base you are trying to access probably doesn't exist." ) # High level API ==============================================================
[dokumentace]def getISBNsXML(isbn, base=ALEPH_DEFAULT_BASE): """ Download full XML record for given `isbn` in `base`. Args: isbn (str): ISBN of the books you want to get. base (str): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: List of strings with full **OAI** XML representation of the \ record. """ return downloadRecords( searchInAleph( base, isbn, False, "sbn" ) )
[dokumentace]def getISSNsXML(issn, base=ALEPH_DEFAULT_BASE): """ Download full XML record for given `issn` in `base`. Args: issn (str): ISSN of the books you want to get. base (str): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: List of strings with full **OAI** XML representation of the \ record. """ return downloadRecords( searchInAleph( base, issn, False, "ssn" ) )
[dokumentace]def getAuthorsBooksXML(author, base=ALEPH_DEFAULT_BASE): """ Download full XML record for given `author` in `base`. Args: author (str): Name of the `author` of the books you want to get. base (str): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: List of strings with full **OAI** XML representation of the \ record. """ return downloadRecords( searchInAleph( base, author, False, "wau" ) )
[dokumentace]def getPublishersBooksXML(publisher, base=ALEPH_DEFAULT_BASE): """ Download full XML record for given `publisher` in `base`. Args: publisher (str): Name of the `publisher` of the books you want to get. base (str): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: List of strings with full **OAI** XML representation of the \ record. """ return downloadRecords( searchInAleph( base, publisher, False, "wpb" ) )
[dokumentace]def getBooksTitleXML(title, base=ALEPH_DEFAULT_BASE): """ Download full XML record for given `title` in `base`. Args: title (str): `title` of the books you want to get. base (str): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: List of strings with full **OAI** XML representation of the \ record. """ return downloadRecords( searchInAleph( base, title, False, "wtl" ) )
[dokumentace]def getICZBooksXML(icz, base=ALEPH_DEFAULT_BASE): """ Download full XML record for given `icz` (identification number) in `base`. Args: icz (str): Identification number used to search Aleph. base (str): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: List of strings with full **OAI** XML representation of the \ record. """ return downloadRecords( searchInAleph( base, icz, False, "icz" ) ) # ID getters ==================================================================
[dokumentace]def getISBNsIDs(isbn, base=ALEPH_DEFAULT_BASE): """ Get list of :class:`DocumentID` objects of documents with given `isbn`. Args: isbn (str): ISBN string. base (str, optional): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: of :class:`DocumentID` objects """ return getDocumentIDs(searchInAleph(base, isbn, False, "sbn"))
[dokumentace]def getAuthorsBooksIDs(author, base=ALEPH_DEFAULT_BASE): """ Get list of :class:`DocumentID` objects of documents with given `author`. Args: author (str): Authors name/lastname in UTF-8. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: of :class:`DocumentID` objects """ return getDocumentIDs(searchInAleph(base, author, False, "wau"))
[dokumentace]def getPublishersBooksIDs(publisher, base=ALEPH_DEFAULT_BASE): """ Get list of :class:`DocumentID` objects of documents with given `publisher`. Args: publisher (str): Name of publisher which will be used to search Aleph. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: of :class:`DocumentID` objects """ return getDocumentIDs(searchInAleph(base, publisher, False, "wpb"))
[dokumentace]def getBooksTitleIDs(title, base=ALEPH_DEFAULT_BASE): """ Get list of :class:`DocumentID` objects of documents with given `title`. Args: title (str): Title (name) of the book which will be used to search in Aleph. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: of :class:`DocumentID` objects """ return getDocumentIDs(searchInAleph(base, title, False, "wtl"))
[dokumentace]def getICZBooksIDs(icz, base=ALEPH_DEFAULT_BASE): """ Get list of :class:`DocumentID` objects of documents with given `icz` (identification number). Args: icz (str): Identification number used to search Aleph. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: list: of :class:`DocumentID` objects """ return getDocumentIDs(searchInAleph(base, icz, False, "icz")) # Counters ====================================================================
[dokumentace]def getISBNCount(isbn, base=ALEPH_DEFAULT_BASE): """ Get number of records in Aleph which match given `isbn`. Args: isbn (str): ISBN string. base (str, optional): Base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: int: Number of matching documents in Aleph. """ return searchInAleph(base, isbn, False, "sbn")["no_entries"]
[dokumentace]def getAuthorsBooksCount(author, base=ALEPH_DEFAULT_BASE): """ Get number of records in Aleph which match given `author`. Args: author (str): Authors name/lastname in UTF-8. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: int: Number of matching documents in Aleph. """ return searchInAleph(base, author, False, "wau")["no_entries"]
[dokumentace]def getPublishersBooksCount(publisher, base=ALEPH_DEFAULT_BASE): """ Get number of records in Aleph which match given `publisher`. Args: publisher (str): Name of publisher which will be used to search Aleph. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: int: Number of matching documents in Aleph. """ return searchInAleph(base, publisher, False, "wpb")["no_entries"]
[dokumentace]def getBooksTitleCount(title, base=ALEPH_DEFAULT_BASE): """ Get number of records in Aleph which match given `title`. Args: title (str): Title (name) of book which will be used to search Aleph. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: int: Number of matching documents in Aleph. """ return searchInAleph(base, title, False, "wtl")["no_entries"]
[dokumentace]def getICZBooksCount(icz, base=ALEPH_DEFAULT_BASE): """ Get number of records in Aleph which match given `title`. Args: icz (str): Identification number used to search Aleph. base (str, optional): base on which will be search performed. Default :attr:`aleph.settings.ALEPH_DEFAULT_BASE`. Returns: int: Number of matching documents in Aleph. """ return searchInAleph(base, icz, False, "icz")["no_entries"]