#!/usr/bin/perl # Project: Web Reference Database (refbase) # Copyright: Matthias Steffens and the file's # original author(s). # # This code is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY. Please see the GNU General Public # License for more details. # # File: ./contrib/command_line/refbase # Repository: $HeadURL: file:///svn/p/refbase/code/branches/bleeding-edge/contrib/command_line/refbase $ # Author(s): Matthias Steffens # # Created: 06-Jun-06, 18:00 # Modified: $Date: 2008-11-13 21:08:22 +0000 (Thu, 13 Nov 2008) $ # $Author: msteffens $ # $Revision: 1315 $ # REFBASE -- a refbase command line interface # Purpose: Perl script that allows to search a refbase online database from the command line and retrieve results in various formats # Usage: refbase [OPTIONS] # Help: For help with the syntax type 'refbase -h' # To view some usage examples type 'refbase -X' # Further information is available at # Version: 1.2.1 # Requires: - a shell with Perl execution capabilities # - the Perl CPAN modules LWP::UserAgent, HTTP::Request::Common, HTTP::Response, HTTP::Cookies and URI::URL # - access to a refbase database (refbase-0.9.0 or greater, refbase-0.9.5 required for '-A|--append' and '-B|--update' mode) # Limits: - Currently, this utility supports search & retrieve, but does not support update actions such as add, edit or delete. # - This script is currently just an interface to 'show.php', which for example does not support arbitrary sort orders. # - Specifying the record offset (using '-S|--start') as well as the number of records to be returned (using '-R|--rows') will # only work for the formats 'html', 'rtf', 'pdf', 'latex', 'latex_bbl', 'markdown', 'ascii', 'atom', 'srw_dc' and 'srw_mods', # since the other formats are designed to always export the entire result set. Note that for 'html', '--start' is adjusted to # the next lower value that is an exact multiple of '--rows' (which ensures correct behaviour of the browse links). # - The authentication mechanism is currently limited in that a given password will be transferred as parameter in the POST request # - The '-A|--append' and '-B|--update' modes currently only work with formats 'bibtex', 'mods' and 'srw_mods' # -------------------------------------------------------------------------------------------------------------- $version = "1.2.1"; # Configure variables: # Specify the full URLs to any refbase servers that shall be queried: # Notes: - the given hash keys will work as shortcuts, e.g. '--host=local' would query # your local refbase installation; one hash key must be named 'default', # all other keys can be freely chosen # - by default, the server labeled with key 'default' will be queried %hosts = ( 'default' => 'http://beta.refbase.net/', 'local' => 'http://localhost/refs/', 'beta' => 'http://beta.refbase.net/', 'beta2' => 'http://refbase.textdriven.com/beta/', 'demo' => 'http://demo.refbase.net/', 'org' => 'http://www.refbase.org/' ); # Specify the default values for all options that are not explicitly specified: %params = ( # query options: 'author' => '', # -a|--author 'abstract' => '', # -b|--abstract 'cite_key' => '', # -c|--citekey => requires '--userid' 'date' => '', # -d|--date 'area' => '', # -e|--area 'thesis' => '', # -f|--thesis 'contribution_id' => '', # -i|--contribid 'abbrev_journal' => '', # -j|--journal 'keywords' => '', # -k|--keywords 'location' => '', # -l|--location 'ismarked' => '', # -m|--marked => requires '--userid' 'notes' => '', # -n|--notes 'publication' => '', # -p|--publication 'queryType' => 'and', # -q|--query => multiple options will by default be connected with 'AND' 'records' => '', # -r|--records 'selected' => '', # -s|--selected => requires '--userid' 'title' => '', # -t|--title 'userID' => '', # -u|--userid => the user ID of your account at the refbase database you're querying 'where' => '', # -w|--where 'type' => '', # -x|--type 'year' => '', # -y|--year 'serial' => '.+', # -z|--serial => the default '.+' causes all database records to be returned if only empty params are given (normally, you should leave this default as is) # output options: 'appendFile' => '', # -A|--append => file to which search results will be appended 'updateRecords' => '0', # -B|--update => update existing records in '-A, --append' file; must be '0' (=no) or '1' (=yes) 'citeStyle' => '', # -C|--style => desired citation style, given name must match an entry within the database's MySQL table 'styles' (keep empty to use the database default) 'extractFile' => '', # -E|--extract => file from which citations will be extracted 'format' => 'ascii', # -F|--format => output format must be 'html', 'rtf', 'pdf', 'latex', 'latex_bbl', 'markdown', 'ascii', 'ads', 'bibtex', 'endnote', 'isi', 'ris', 'atom', 'mods', 'oai_dc', 'odf', 'srw_dc', 'srw_mods', 'word' or '' (the empty string '' will produce the default 'ascii' output style) 'showLinks' => '1', # -L|--showlinks => hide/display links column in HTML output; must be '0', '1', or '' (the empty string '' will produce the default output style, i.e. print any links) 'citeOrder' => 'author', # -O|--order => cite order must be 'author', 'year', 'type', 'type-year', 'creation-date' or '' (the empty string '' will produce the default 'author' sort order) 'showQuery' => '0', # -Q|--showquery => hide/display SQL query in ASCII output; must be '0', '1', or '' (the empty string '' will produce the default output style, i.e. not showing the SQL query) 'showRows' => '', # -R|--rows => desired number of search results (keep empty to use the database default) 'startRecord' => '1', # -S|--start => offset of the first search result, starting with one 'viewType' => 'web', # -V|--view => view type of HTML output; must be 'Web', 'Print', 'Mobile' or '' (the empty string '' will produce the default 'Web' output style) # fixed parameters: 'submit' => 'Cite', # display type for HTML output; must be 'Display', 'Cite', 'Export', or '' (the empty string '' will produce the default 'columnar' output style); this param's value will get adopted below based on the chosen '--format' 'client' => "cli-refbase-" . $version # the client ID of this command line utility ); # Specify the default login credentials for a refbase user account: %loginParams = ( 'loginEmail' => '', # -U|--user -> the login email address of an existing refbase user 'loginPassword' => '' # -P|--password -> the password for the given user account ); # Specify the location of the cookie jar file: # This file will be used to store & retrieve cookies $cookieJarFile = "$ENV{HOME}/.lwpcookies.txt"; # For '-A|--append' mode, specify whether all records in the given file # shall be sorted by cite key after any records have been appended/updated: $sortAppendFileData = 1; # must be '1' (=yes) or '0' (=no) # For '-A|--append' mode, specify whether all appended/updated # records shall be reported as citations to STDOUT: $reportResults = 1; # must be '1' (=yes) or '0' (=no) # For '-A|--append' mode, specify whether a backup file shall be created # before anything gets updated or appended to the given file: $backupAppendFile = 1; # must be '1' (=yes) or '0' (=no) # By default, the backup file uses the given file name with a tilde character # added at the end of the file name. Adopt to your needs if necessary: $backupFileNameSuffix = "~"; # For '-E|--extract' mode, specify regular expression patterns that match the # citation IDs (i.e. refbase serial numbers or cite keys from the 'cite_key' # field) in the given file: # For each recognized file name extension[*], there must be a triplet of code # (as shown below), each hash element contains an array with three elements: # 1) a regex pattern that matches the cite IDs in the file (e.g. '\\\\cite\{(.+?)\}') # (note that the regex patterns will be applied using the 'msg' mode modifiers) # 2) the number of the sub-pattern that captures the cite IDs (e.g. '1') # 3) a regex split pattern that matches the delimiter(s) used between multiple cite IDs (e.g. '[, ]+') # [*]: The hash key must match the file's file name extension. If the given file is # of unknown file type the 'default' pattern will be applied. %citeIDPatterns = ( # - LaTeX, .tex file: 'tex' => [ '\\\\(?:(?:no)?cite|cite(?:n|num|online)|cite(?:al)?[tp]\*?|cite(?:author\*?|year(?:par)?|text|[tp]alias))(?:\[.*?\])*\{(.+?)\}', # 1), matches e.g.: '\cite{...}', '\nocite{...}', '\cite[...]{...}' and cite commands from the 'cite' & 'natbib' packages '1', # 2) '[, ]+' ], # 3) # - LaTeX, .aux file: 'aux' => [ '\\\\(?:citation|bibcite)\{(.+?)\}', # matches '\citation{...}' or '\bibcite{...}' '1', '[, ]+' ], # - LaTeX, .bib file: 'bib' => [ '^[\t ]*@[A-Za-z]+\{(.+?),', # matches '@Article{...,' etc '1', '[, ]+' ], # - LaTeX, .bbl file: 'bbl' => [ '\\\\bibitem(?s:\[.*?\])*\{(.+?)\}', # matches '\bibitem{...}' or '\bibitem[...]{...}' '1', '[, ]+' ], # - MODS, SRW_MODS, or Endnote XML file: # TODO: add support for OAI_DC, SRW_DC and refbase OpenSearch Atom XML (citekey:...) 'xml' => [ '(?:|<(?:label|accession-num)>\s*\s*)(.+?)(?:|\s*\s*|")', # matches MODS IDs like '...' etc -OR- Endnote XML IDs, e.g. '' etc '1', '[, ]+' ], # - Endnote tagged text, .enw file: 'enw' => [ '^%F ([^\n\r]+)$', # matches '%F ...' '1', '[, ]+' ], # - RIS, .ris file: 'ris' => [ '^ID - ([^\n\r]+)$', # matches 'ID - ...' '1', '[, ]+' ], # - Generic, extracts IDs from lists of comma-separated refbase serial numbers (or cite keys) that are enclosed by braces: 'default' => [ '\{([^\n\r]+?)\}', # matches e.g.: '{123}', '{1,12,33}', '{Arrigo+Thomas2004}' or '{Arrigo+Thomas2004,Assur1958}' '1', '[,]+' ], ); # -------------------------------------------------------------------------------- # NOTE: You shouldn't need to change anything below this line # CPAN modules: use LWP::UserAgent; # more info: use HTTP::Request::Common; # more info: use HTTP::Response; # more info: use HTTP::Cookies; # more info: use URI::URL; # more info: # standard modules: use Time::Local; # initialize variables: $host = $hosts{'default'}; $optCt = 0; $format = ''; @appendFileSerials = (); $updateRecords = 0; @extractFileSerials = (); @extractFileKeys = (); %months = ( 'Jan' => 0, 'Feb' => 1, 'Mar' => 2, 'Apr' => 3, 'May' => 4, 'Jun' => 5, 'Jul' => 6, 'Aug' => 7, 'Sep' => 8, 'Oct' => 9, 'Nov' => 10, 'Dec' => 11 ); # Extract options: # TODO: use Getopt::Long # general options: if (($ARGV[0] eq '--help') or ($ARGV[0] eq '-h') or ($ARGV[0] eq '')) { &usage (0); } # if the user asked for --help/-h or didn't provide any input, call the 'usage' subroutine elsif (($ARGV[0] eq '--version') or ($ARGV[0] eq '-v')) { &version (0); } # show version information elsif (($ARGV[0] eq '--examples') or ($ARGV[0] eq '-X')) { &examples (0); } # print some usage examples else { foreach (@ARGV) { # extract query options: if ($_ =~ /^(?:-a|--author)=(.+)$/) { $params{'author'} = $1; $optCt++; } elsif ($_ =~ /^(?:-b|--abstract)=(.+)$/) { $params{'abstract'} = $1; $optCt++; } elsif ($_ =~ /^(?:-c|--citekey)=(.+)$/) { $params{'cite_key'} = $1; $optCt++; } elsif ($_ =~ /^(?:-d|--date)=(.+)$/) { $params{'date'} = $1; $optCt++; } elsif ($_ =~ /^(?:-e|--area)=(.+)$/) { $params{'area'} = $1; $optCt++; } elsif ($_ =~ /^(?:-f|--thesis)=(.+)$/) { $params{'thesis'} = $1; $optCt++; } elsif ($_ =~ /^(?:-i|--contribid)=(.+)$/) { $params{'contribution_id'} = $1; $optCt++; } elsif ($_ =~ /^(?:-j|--journal)=(.+)$/) { $params{'abbrev_journal'} = $1; $optCt++; } elsif ($_ =~ /^(?:-k|--keywords)=(.+)$/) { $params{'keywords'} = $1; $optCt++; } elsif ($_ =~ /^(?:-l|--location)=(.+)$/) { $params{'location'} = $1; $optCt++; } elsif ($_ =~ /^(?:-m|--marked)=(.+)$/) { $params{'ismarked'} = $1; $optCt++; } elsif ($_ =~ /^(?:-n|--notes)=(.+)$/) { $params{'notes'} = $1; $optCt++; } elsif ($_ =~ /^(?:-p|--publication)=(.+)$/) { $params{'publication'} = $1; $optCt++; } elsif ($_ =~ /^(?:-q|--query)=(.+)$/) { $params{'queryType'} = $1; } elsif ($_ =~ /^(?:-r|--records)=(.+)$/) { $params{'records'} = $1; $optCt++; } elsif ($_ =~ /^(?:-s|--selected)=(.+)$/) { $params{'selected'} = $1; $optCt++; } elsif ($_ =~ /^(?:-t|--title)=(.+)$/) { $params{'title'} = $1; $optCt++; } elsif ($_ =~ /^(?:-u|--userid)=(.+)$/) { $params{'userID'} = $1; $optCt++; } elsif ($_ =~ /^(?:-w|--where)=(.+)$/) { $params{'where'} = $1; $optCt++; } elsif ($_ =~ /^(?:-x|--type)=(.+)$/) { $params{'type'} = $1; $optCt++; } elsif ($_ =~ /^(?:-y|--year)=(.+)$/) { $params{'year'} = $1; $optCt++; } elsif ($_ =~ /^(?:-z|--serial)=(.+)$/) { $params{'serial'} = $1; } # extract output options: elsif ($_ =~ /^(?:-A|--append)=(.+)$/) { $params{'appendFile'} = $1; } elsif ($_ =~ /^(?:-B|--update)=(.+)$/) { $params{'updateRecords'} = $1; } elsif ($_ =~ /^(?:-C|--style)=(.+)$/) { $params{'citeStyle'} = $1; } elsif ($_ =~ /^(?:-E|--extract)=(.+)$/) { $params{'extractFile'} = $1; } elsif ($_ =~ /^(?:-F|--format)=(.+)$/) { $params{'format'} = $1; } elsif ($_ =~ /^(?:-L|--showlinks)=(.+)$/) { $params{'showLinks'} = $1; } elsif ($_ =~ /^(?:-O|--order)=(.+)$/) { $params{'citeOrder'} = $1; } elsif ($_ =~ /^(?:-Q|--showquery)=(.+)$/) { $params{'showQuery'} = $1; } elsif ($_ =~ /^(?:-R|--rows)=(.+)$/) { $params{'showRows'} = $1; } elsif ($_ =~ /^(?:-S|--start)=(.+)$/) { $params{'startRecord'} = $1; } elsif ($_ =~ /^(?:-V|--view)=(.+)$/) { $params{'viewType'} = $1; } # extract server options: elsif ($_ =~ /^(?:-H|--host)=(.+)$/) { $host = $1; } elsif ($_ =~ /^(?:-P|--password)=(.+)$/) { $loginParams{'loginPassword'} = $1; } elsif ($_ =~ /^(?:-U|--user)=(.+)$/) { $loginParams{'loginEmail'} = $1; } } } # resolve any host shortcuts: if (exists($hosts{$host})) { $host = $hosts{$host}; } elsif ($host !~ /^https?:\/\//i) { $host = $hosts{'default'}; # can't resolve given host, reset back to default } # if any query option other than the 'serial' parameter was explicitly set, # remove any default '.+' value from the 'serial' parameter: # (otherwise an 'OR' query would always match everything) if (($optCt > 0) && ($params{'serial'} eq '.+')) { # if '--citekey', '--selected' or '--marked' is given, '--userid' must be specified as well; i.e., # in case of these user-specific params, we'll only empty the 'serial' param if a user ID is present if (($params{'cite_key'} eq '') && ($params{'selected'} eq '') && ($params{'ismarked'} eq '')) { $params{'serial'} = ''; } elsif ($params{'userID'} ne '') { # at least one of '--citekey', '--selected' or '--marked' was given together with a '--userid' $params{'serial'} = ''; } } # assign correct URL params based on the '-F|--format' option: if (exists($params{'format'})) { $format = $params{'format'}; if ($format =~ /^(rtf|pdf|latex|latex_bbl|markdown|ascii)$/i) { $params{'submit'} = "Cite"; } if ($format =~ /^(html|rtf|pdf|latex|latex_bbl|markdown|ascii)$/i) { $format =~ s/^latex_bbl$/LaTeX .bbl/i; $params{'citeType'} = $format; } elsif ($format =~ /^(ads|bibtex|endnote|isi|ris|atom|mods|oai_dc|odf|srw(_dc|_mods)?|word)$/i) { $params{'submit'} = "Export"; $params{'exportType'} = "file"; if ($format =~ /^(ads|bibtex|endnote|isi|ris)$/i) { $params{'exportFormat'} = $format; } elsif ($format =~ /^(atom|mods|oai_dc|odf|srw(_dc|_mods)?|word)$/i) { $params{'exportFormat'} = $format . " xml"; } } else { $params{'citeType'} = "ascii"; } delete($params{'format'}); } # '-E|--extract' functionality: if ($params{'extractFile'} ne '') { $extractFile = $params{'extractFile'}; # remove 'extractFile' parameter (which we don't need to send to the refbase server): delete($params{'extractFile'}); # check if the given file exists and is readable: if (!(-r $extractFile)) { print "The '-E|--extract' option requires a name/path to an existing file.\n\n"; exit; } else { # the given '$extractFile' exists and is readable $fileExtension = "default"; # triggers default regex patterns for extraction of cite IDs # extract any file name extension: if ($extractFile =~ /\.([^.\n]+)$/) { if (exists($citeIDPatterns{$1})) { # if '$extractFile' has a recognized file name extension $fileExtension = $1; # use file-specific regex patterns for extraction of cite IDs } } $citeIDRegex = @{$citeIDPatterns{$fileExtension}}[0]; # get regex pattern that matches the cite IDs in '$extractFile' $citeIDNum = @{$citeIDPatterns{$fileExtension}}[1]; # get number of the sub-pattern that captures the cite IDs $citeIDSplit = @{$citeIDPatterns{$fileExtension}}[2]; # get regex split pattern that matches the delimiter(s) used between multiple cite IDs # open '$extractFile' in read mode: open(KEYS, "<", $extractFile) || die "Can't open file '" . $extractFile . "': $!\n"; # read the entire file at once: undef $/; $extractFileString = ; # close '$extractFile': close(KEYS) || die "Can't close file: $!\n"; # extract all refbase serial numbers or cite keys that exist in '$extractFile': while ($extractFileString =~ /$citeIDRegex/msg) { $citeID = $$citeIDNum; if ($citeID =~ /^(?:\d+|$citeIDSplit)+$/) { # '$citeID' is assumed to be a refbase serial number (or a list of multiple serials) while ($citeID =~ /(\d+)/g) { push(@extractFileSerials, $1); } } else { # '$citeID' is assumed to be a cite key (or a list of multiple cite keys) push(@extractFileKeys, split(/$citeIDSplit/, $citeID)); } } # remove any duplicate cite IDs: @extractFileSerials = &uniquify(@extractFileSerials); @extractFileKeys = &uniquify(@extractFileKeys); # add query clause to restrict search results to records existing in '$extractFile': if (@extractFileSerials || @extractFileKeys) { if ($params{'where'} ne '') { $params{'where'} .= " AND "; } $params{'where'} .= "("; if (@extractFileSerials) { $params{'where'} .= 'serial RLIKE "^(' . join('|', @extractFileSerials) . ')$"'; } if (@extractFileKeys) { if (@extractFileSerials) { $params{'where'} .= " OR "; } $params{'where'} .= 'cite_key RLIKE "^(' . join('|', map {quotemeta(quotemeta($_))} @extractFileKeys) . ')$"'; # quotes each key before joining them } $params{'where'} .= ")"; } else { # no citation IDs could be extracted from '$extractFile' print "No citation IDs were found in file '" . $extractFile . "'!\n\n"; exit; } } } # '-A|--append' mode: if ($params{'appendFile'} ne '') { # '-A|--append' mode currently only works with formats 'bibtex', 'mods' and 'srw_mods', # for all other formats (or if no format was specified), we'll return an error message: if ($format !~ /^(bibtex|mods|srw(_mods)?)$/i) { print "The '-A, --append' option requires '-F, --format' set to 'bibtex', 'mods' or 'srw_mods'.\n\n"; exit; } $appendFile = $params{'appendFile'}; # remove 'appendFile' parameter (which we don't need to send to the refbase server): delete($params{'appendFile'}); $appendFileString = ''; # if the given '$appendFile' exists and is readable: if (-r $appendFile) { # open '$appendFile' in read mode: open(FILEIN, "<", $appendFile) || die "Can't open file '" . $appendFile . "': $!\n"; # read the entire file at once: undef $/; $appendFileString = ; # extract all refbase serial numbers (which already exist in '$appendFile') into an array: while ($appendFileString =~ /(?<=show\.php\?record=)(\d+)/g) { push(@appendFileSerials, $1); } # close '$appendFile': close(FILEIN) || die "Can't close file: $!\n"; # backup existing contents of '$appendFile': # (note that, currently, this will overwrite contents of any existing backup file # with the same backup file name, even if nothing gets updated/appended further down) if ($backupAppendFile) { # create suitable backup file name: if ($appendFile =~ /\.([^.\n]+)$/) { # if the file has a file name extension ($appendFileBackup = $appendFile) =~ s/\.([^.\n]+)$/$backupFileNameSuffix.$1/; # add backup suffix in front of the file name extension } else { $appendFileBackup = $appendFile . $backupFileNameSuffix; # add backup suffix at end of file } # open backup file in write mode (creates new file if it doesn't exist): open(FILEBACKUP, ">", $appendFileBackup) || die "Can't open file '" . $appendFileBackup . "': $!\n"; # write existing contents of '$appendFile' to backup file (replacing any previous contents): print FILEBACKUP $appendFileString; # close backup file: close(FILEBACKUP) || die "Can't close file: $!\n"; } # add query clause to exclude existing records from search results: if (@appendFileSerials) { # save current contents of the '-w|--where' option before messing with it further: # (it will be needed in the 'append' and 'update' subroutines below) $where = $params{'where'}; if ($params{'where'} ne '') { $params{'where'} .= " AND "; } $params{'where'} .= 'serial NOT RLIKE "^(' . join('|', @appendFileSerials) . ')$"'; } } # if no '$appendFile' exists, it will be created by the 'append' subroutine below } # '-B|--update' functionality: if ($params{'updateRecords'} == 1) { $updateRecords = 1; } # remove 'updateRecords' parameter (which we don't need to send to the refbase server): delete($params{'updateRecords'}); # for HTML output, we'll adjust the display type if the '-r|--records' option contains a single record serial number: if (($params{'citeType'} eq 'html') && ($params{'records'} =~ /^[0-9]+$/)) { $params{'submit'} = "Display"; } # NOTE: I tried to put all query-related code into a dedicated function but for # some reason that didn't work with redirects. ?:-/ # initialize new user agent: # (uses LWP::UserAgent) $userAgent = LWP::UserAgent->new; # set user agent string: $userAgent->agent("refbase/" . $version . " (http://cli.refbase.net/) "); # allow redirection for 'POST' requests: # (by default, the list of request names that '$userAgent->redirect_ok(...)' # will allow redirection for is only set to ['GET', 'HEAD'], as per RFC 2616) push @{ $userAgent->requests_redirectable }, 'POST'; # set cookie jar object: # LWP will collect cookies and respond to cookie requests via its cookie jar, thus # enabling the user agent to fetch a PHP session ID from the refbase login response # and automatically resend it upon next request $userAgent->cookie_jar({ file => $cookieJarFile, autosave => 1 }); # attempt to authenticate using the given login credentials: if (($loginParams{'loginEmail'} ne '') && ($loginParams{'loginPassword'} ne '')) { $loginSuccessful = &login(0); # call the 'login' subroutine } # construct URL: # (uses URI::URL) $script = "show.php"; $url = url($host . $script); # build and send GET/POST request: # (uses HTTP::Request::Common & HTTP::Response) # build POST request using the 'application/x-www-form-urlencoded' content type: $request = POST $url, \%params; # or, build POST request using the 'multipart/form-data' content type: # $request = POST $url, Content_Type => 'form-data', Content => \%params; # alternatively, build GET request: # (note that for large URLs, a GET request may cause an "414 Request-URI Too Large" error) # $url->query_form(%params); # $request = GET $url; # print $request->as_string(); # DEBUG (dumps the GET/POST request) # send the GET/POST request: $response = $userAgent->request($request); # or use: $response = $userAgent->get($url); # print returned results: if ($response->is_error()) { # if the request fails, print error message to STDERR: print STDERR $response->status_line, "\n"; } elsif (defined($appendFile)) { # if an '$appendFile' was given print results to FILE: if ($updateRecords == 1 && @appendFileSerials) { &update(0); # update existing records in '$appendFile' with found results } $resultsString = $response->content(); # TODO: it would be better to use function 'splitRecs' to check whether there # were any results returned for the given format if ($resultsString =~ /^Nothing found!\n+/i) { $resultsString = ''; } if (($resultsString eq '') || (($format =~ /^(mods|srw(_mods)?)$/i) && ($resultsString !~ /]/i))) { print "There are no records that need to be added to file '" . $appendFile . "'.\n\n"; } else { # if the query returned results which aren't yet in '$appendFile': &append(0); # append search results to file } } else { # print results to STDOUT: binmode STDOUT; print $response->content(); } # -------------------------------------------------------------------------------- # Login with login credentials given in '%loginParams': sub login { local ($status) = @_; # construct URL: # (uses URI::URL) $loginScript = "user_login.php"; $loginURL = url($host . $loginScript); # send POST request: # (uses HTTP::Request::Common & HTTP::Response) $loginRequest = POST $loginURL, \%loginParams; $loginResponse = $userAgent->request($loginRequest); if ($loginResponse->is_error()) { print STDERR $loginResponse->status_line, "\n"; exit $status; } else { $location = $loginResponse->header('Location'); # upon successful login, refbase will redirect to 'index.php' if ($location =~ /index.php/) { return 1; # login successful } else { return 0; # login NOT successful } } } # -------------------------------------------------------------------------------- # Append search results to file: sub append { local ($status) = @_; local @newAppendFileData = (); $whereAppend = ""; # TODO: In order to enable sorting of all records for MODS + SRW, we should remove the # XML file header & footer from the results (and store them in a variable). Then # merge existing & new records, split, sort, join again, and finally put the XML # header & footer back. if ($format =~ /^(mods|srw(_mods)?)$/i) { # if the '$appendFile' contains MODS XML data: if (($format =~ /^mods$/i) && ($appendFileString =~ /' opening tag from results: $resultsString =~ s/^<\?xml.+?\n]*?>\n//ims; # remove '' closing tag from existing records in '$appendFile': $appendFileString =~ s/^<\/modsCollection>//ims; } # if the '$appendFile' contains SRW_MODS XML data: elsif (($format =~ /^srw(_mods)?$/i) && ($appendFileString =~ /\n]*?>\n//ims; # remove closing tags from existing records in '$appendFile': $appendFileString =~ s/^\s*<\/srw:records>.*?<\/srw:searchRetrieveResponse>//ims; # TODO: update values in '' and '' } } # append search results to the '$appendFile' contents: $newAppendFileString = $appendFileString; if (!($sortAppendFileData) && ($format =~ /^bibtex$/i) && ($appendFileString ne '') && ($resultsString ne '')) { $newAppendFileString .= "\n"; } $newAppendFileString .= $resultsString; # sort all records by cite key: if ($sortAppendFileData) { # remove any newlines from end of string: $newAppendFileString =~ s/[\n\r]+$//; # split '$appendFile' contents & search results on (format-specific) record delimiters: @newAppendFileData = &splitRecs($newAppendFileString); # sort array of records by cite key: @newAppendFileData = &sortRecs(@newAppendFileData); # merge again records into a string: if ($format =~ /^bibtex$/i) { $recDelim = "\n\n"; } else { $recDelim = "\n"; } $newAppendFileString = join($recDelim, @newAppendFileData) . "\n"; } # open '$appendFile' in write mode (creates new file if it doesn't exist): open(FILEADD, ">", $appendFile) || die "Can't open file '" . $appendFile . "': $!\n"; # write existing contents & new results back to '$appendFile': print FILEADD $newAppendFileString; # close '$appendFile': close(FILEADD) || die "Can't close file: $!\n"; # to give some feedback, we output all appended records as citations to STDOUT: # TODO: move into a dedicated function if possible (compare with 'update' subroutine) if ($reportResults) { $params{'submit'} = "Cite"; $params{'citeType'} = "ascii"; $params{'showRows'} = "99999"; $params{'headerMsg'} = "Added records:"; if ($where ne '') { $whereAppend = $where . " AND "; } if (($appendFileString ne '') && (@appendFileSerials)) { # '$appendFile' contains some records # add query clause to exclude existing records from search results: $whereAppend .= 'serial NOT RLIKE "^(' . join('|', @appendFileSerials) . ')$"'; } else { # '$appendFile' didn't exist or was empty # extract all refbase serial numbers from '$resultsString' into an array: while ($resultsString =~ /(?<=show\.php\?record=)(\d+)/g) { push(@appendFileSerials, $1); } # add query clause to display all records from the current search results: $whereAppend .= 'serial RLIKE "^(' . join('|', @appendFileSerials) . ')$"'; } $params{'where'} = $whereAppend; # construct URL: # (uses URI::URL) $appendFeedbackURL = url($host . $script); # send POST request: # (uses HTTP::Request::Common & HTTP::Response) $appendFeedbackRequest = POST $appendFeedbackURL, \%params; $appendFeedbackResponse = $userAgent->request($appendFeedbackRequest); print $appendFeedbackResponse->content(); } } # -------------------------------------------------------------------------------- # Update existing records (given in '$appendFile') with found results: sub update { local ($status) = @_; local @appendFileData = (); local @newAppendFileData = (); @updatedSerials = (); $whereUpdate = ""; # add query clause to restrict search results to records existing in '$appendFile': if ($where ne '') { $whereUpdate = $where . " AND "; } $whereUpdate .= 'serial RLIKE "^(' . join('|', @appendFileSerials) . ')$"'; $params{'where'} = $whereUpdate; # fetch all records that match the given query AND which exist in '$appendFile': # construct URL: # (uses URI::URL) $updateURL = url($host . $script); # send POST request: # (uses HTTP::Request::Common & HTTP::Response) $updateRequest = POST $updateURL, \%params; $updateResponse = $userAgent->request($updateRequest); $updateResultsString = $updateResponse->content(); # if the '$appendFile' contains MODS XML data: if (($format =~ /^mods$/i) && ($appendFileString =~ /' opening tag from results: $updateResultsString =~ s/^<\?xml.+?\n]*?>\n//ims; # remove '' closing tag from results: $updateResultsString =~ s/^<\/modsCollection>//ims; } # if the '$appendFile' contains SRW_MODS XML data: elsif (($format =~ /^srw(_mods)?$/i) && ($appendFileString =~ /\n]*?>\n//ims; # remove closing tags from results: $updateResultsString =~ s/^\s*<\/srw:records>.*?<\/srw:searchRetrieveResponse>//ims; # TODO: update values in '' and '' } # split search results on (format-specific) record delimiters: @updateResultsData = &splitRecs($updateResultsString); # extract refbase serial number & UNIX time stamp from returned records: %updateResultsRecords = &parseRecs(@updateResultsData); # split '$appendFile' contents on (format-specific) record delimiters: @appendFileData = &splitRecs($appendFileString); # replace existing records with new ones: foreach $record (@appendFileData) { # for each of the records existing in '$appendFile' # NOTE: instead of looping over all existing records, it would probably be more effective # to loop over each of the newly fetched (updated) records instead, and directly replace # all records in '$appendFileString' that have a matching serial number, but I couldn't # get this working correctly... # if this record contains a refbase serial number & UNIX time stamp: if ($record =~ /(?<=show\.php\?record=)(\d+)[^\d\n]+(\d{2}) ([[:alpha:]]{3}) (\d{4}) (\d{2}):(\d{2}):(\d{2})/i) { $serial = $1; if (exists($updateResultsRecords{$serial})) { # if one of the newly fetched (updated) records has an identical serial number # extract time stamp from existing record: $monthday = $2; $month = $months{$3}; # 0 == January $year = $4 - 1900; # by default, Perl counts years from 1900 $hour = $5; $min = $6; $sec = $7; # convert existing record's time stamp to epoch seconds: $modifiedDateExisting = timelocal($sec, $min, $hour, $monthday, $month, $year); # uses Time::Local # extract source data & time stamp from the matching updated record: $updatedRecord = @{$updateResultsRecords{$serial}}[0]; $modifiedDateUpdated = @{$updateResultsRecords{$serial}}[1]; if ($modifiedDateUpdated > $modifiedDateExisting) { # if the updated record has a more recent time stamp # replace existing record with updated record: # (variables '$recStart' and '$recEnd' get defined in subroutine 'splitRecs') $record =~ s/^$recStart.+?exported from refbase.+?show\.php\?record=$serial.+?$recEnd$/$updatedRecord/ims; push(@updatedSerials, $serial); } } } push(@newAppendFileData, $record); } if (! @updatedSerials) { print "There are no records that need to be updated in file '" . $appendFile . "'.\n\n"; } else { # update existing records in variable '$appendFileString': if ($format =~ /^bibtex$/i) { $recDelim = "\n\n"; } else { $recDelim = "\n"; } $appendFileString = join($recDelim, @newAppendFileData); # if the initial query did not return results which aren't yet in '$appendFile': # (i.e. if the 'append' subroutine won't get triggered) if (($resultsString eq '') || (($format =~ /^(mods|srw(_mods)?)$/i) && ($resultsString !~ /]/i))) { # open '$appendFile' in write mode: open(FILEOUT, ">", $appendFile) || die "Can't open file '" . $appendFile . "': $!\n"; # write back updated file: print FILEOUT $appendFileString; # close '$appendFile': close(FILEOUT) || die "Can't close file: $!\n"; } # otherwise, the 'append' subroutine will write contents of '$appendFileString' back to '$appendFile' # to give some feedback, we output all updated records as citations to STDOUT: # TODO: move into a dedicated function if possible (compare with 'append' subroutine) if ($reportResults) { $params{'submit'} = "Cite"; $params{'citeType'} = "ascii"; $params{'showRows'} = "99999"; $params{'headerMsg'} = "Updated records:"; $whereUpdate = ""; # add query clause to restrict search results to updated records: if ($where ne '') { $whereUpdate = $where . " AND "; } $whereUpdate .= 'serial RLIKE "^(' . join('|', @updatedSerials) . ')$"'; $params{'where'} = $whereUpdate; # construct URL: # (uses URI::URL) $updateFeedbackURL = url($host . $script); # send POST request: # (uses HTTP::Request::Common & HTTP::Response) $updateFeedbackRequest = POST $updateFeedbackURL, \%params; $updateFeedbackResponse = $userAgent->request($updateFeedbackRequest); print $updateFeedbackResponse->content(); } } } # -------------------------------------------------------------------------------- # Split '$sourceText' on format-specific record delimiters: # (note that, currently, only formats 'bibtex', 'mods' and 'srw_mods' are supported) sub splitRecs { local ($sourceText) = @_; local (@records) = (); if ($format =~ /^(bibtex|mods|srw(_mods)?)$/i) { # define format-specific strings that open/close a record: # (note that '...' must be used here instead of "...", see Friedl regex book, 1st ed., p.300+) if ($format =~ /^bibtex$/i) { $recStart = '@'; $recEnd = '\}'; } elsif ($format =~ /^mods$/i) { $recStart = '\s*]'; $recEnd = 'mods>'; } elsif ($format =~ /^srw(_mods)?$/i) { $recStart = '\s*]'; $recEnd = 'srw:record>'; } @records = split(/(?<=$recEnd)\s*\n(?=$recStart)/m, $sourceText); } return @records; } # -------------------------------------------------------------------------------- # Extract records containing a refbase serial number & UNIX time stamp from '@records': # Returns a hash of records where each hash element # - is keyed by the record's serial number # - contains a reference to an unnamed array which holds two array elements: # - the record's source data # - the modified date/time stamp converted to epoch seconds sub parseRecs { local (@records) = @_; %refbaseRecords = (); foreach $record (@records) { # extract refbase serial number & UNIX time stamp from this record: if ($record =~ /(?<=show\.php\?record=)(\d+)[^\d\n]+(\d{2}) ([[:alpha:]]{3}) (\d{4}) (\d{2}):(\d{2}):(\d{2})/i) { $serial = $1; $monthday = $2; $month = $months{$3}; # 0 == January $year = $4 - 1900; # by default, Perl counts years from 1900 $hour = $5; $min = $6; $sec = $7; # add modified date to array (after converting to epoch seconds): $modifiedDate = timelocal($sec, $min, $hour, $monthday, $month, $year); # uses Time::Local $record =~ s/\s+$//; # remove any trailing whitespace # note that if '@records' contains several records with the same refbase # serial number, only the last one will be included in '%refbaseRecords' # (and thus only the last one will get updated by the 'update' subroutine) push(@{$refbaseRecords{$serial}}, $record); push(@{$refbaseRecords{$serial}}, $modifiedDate); # NOTE: by storing a reference to an array we can add multiple values per hash key (see Perl Cookbook 5.7) } } return %refbaseRecords; } # -------------------------------------------------------------------------------- # Sort array of records by cite key: # (note that, currently, only formats 'bibtex', 'endnote', 'ris' 'mods' and 'srw_mods' are supported) sub sortRecs { local (@records) = @_; %keyedRecordData = (); @sortedRecordData = (); $i = 1; $fileExtension = "default"; # triggers default regex patterns for extraction of cite IDs # use file-specific regex patterns for extraction of cite IDs: if ($format =~ /^(bibtex|endnote|ris|mods|srw(_mods)?)$/i) { if ($format =~ /^bibtex$/i) { $fileExtension = "bib"; } elsif ($format =~ /^endnote$/i) { $fileExtension = "enw"; } elsif ($format =~ /^ris$/i) { $fileExtension = "ris"; } elsif ($format =~ /^(mods|srw(_mods)?)$/i) { $fileExtension = "xml"; } } $citeIDRegex = @{$citeIDPatterns{$fileExtension}}[0]; # get regex pattern that matches the cite IDs in the records $citeIDNum = @{$citeIDPatterns{$fileExtension}}[1]; # get number of the sub-pattern that captures the cite IDs foreach $record (@records) { # extract cite IDs and use them as hash keys: # (we always append an incrementing number to ensure unique keys; the number # is padded with leading zeros in order to allow for correct string sorting) if ($record =~ /$citeIDRegex/msg) { $citeID = $$citeIDNum . sprintf("-%06d", $i++); } else { # no cite ID found, so we just use an incrementing number as hash key $citeID = sprintf("%06d", $i++); } $keyedRecordData{$citeID} = $record; } @sortedKeys = sort keys %keyedRecordData; foreach $key (@sortedKeys) { push(@sortedRecordData, $keyedRecordData{$key}); } return @sortedRecordData; } # -------------------------------------------------------------------------------- # Remove any duplicate items from '@array': sub uniquify { local (@array) = @_; local (%unique) = (); foreach $item (@array) { $unique{$item}++; } return (sort keys %unique); } # -------------------------------------------------------------------------------- # Print usage and exit: sub usage { local ($status) = @_; print "\nrefbase command line client, v" . $version . " by Matthias Steffens, http://cli.refbase.net/\n\n" . "Usage: refbase [OPTIONS]\n\n" . "Notes: - At least one query option must be given and unrecognized options will be ignored.\n" . " - If multiple options are given, they will by default be connected with 'AND'. Use\n" . " '--query=or' to connect multiple options with 'OR'.\n" . " - Options syntax: [OPTION]=[VALUE], e.g. '-a=steffens' or '--author=\"steffens, m\"'.\n" . " - Returns up to '--rows' number of records beginning with '--start'. If all given\n" . " query options are empty, all database records will be returned.\n" . " - Note that '--records' assumes a list of full record serials separated by non-digit\n" . " characters while '--serial' allows for partial matches.\n" . " - For each option, default values can be specified at the top of the script.\n" . " Current defaults are given in parentheses.\n\n" . "General Options: -h, --help - display this help text\n" . " -v, --version - display version information\n" . " -X, --examples - display usage examples\n\n" . "Query Options: -a, --author - search author field ('" . $params{'author'} . "')\n" . " -b, --abstract - search abstract field ('" . $params{'abstract'} . "')\n" . " -c, --citekey - search cite_key field, requires '-u, --userid' ('" . $params{'cite_key'} . "')\n" . " -d, --date - search by creation date ('" . $params{'date'} . "')\n" . " -e, --area - search area field ('" . $params{'area'} . "')\n" . " -f, --thesis - search thesis field ('" . $params{'thesis'} . "')\n" . " -i, --contribid - search contribution_id field ('" . $params{'contribution_id'} . "')\n" . " -j, --journal - search abbrev_journal field ('" . $params{'abbrev_journal'} . "')\n" . " -k, --keywords - search keywords field ('" . $params{'keywords'} . "')\n" . " -l, --location - search location field ('" . $params{'location'} . "')\n" . " -m, --marked - search marked field, requires '-u, --userid' ('" . $params{'ismarked'} . "')\n" . " -n, --notes - search notes field ('" . $params{'notes'} . "')\n" . " -p, --publication - search publication field ('" . $params{'publication'} . "')\n" . " -q, --query - query type, possible values: and, or ('" . $params{'queryType'} . "')\n" . " -r, --records - search serial field ('" . $params{'records'} . "')\n" . " -s, --selected - search selected field, requires '-u, --userid' ('" . $params{'selected'} . "')\n" . " -t, --title - search title field ('" . $params{'title'} . "')\n" . " -u, --userid - join with user-specific data from user ID ('" . $params{'userID'} . "')\n" . " -w, --where - search by using a raw sql where clause ('" . $params{'where'} . "')\n" . " -x, --type - search type field ('" . $params{'type'} . "')\n" . " -y, --year - search year field ('" . $params{'year'} . "')\n" . " -z, --serial - search serial field (partial matches) ('" . $params{'serial'} . "')\n\n" . "Output Options: -A, --append - file to which returned records are appended ('" . $params{'appendFile'} . "')\n" . " requires '-F, --format': bibtex, mods, srw_mods\n" . " -B, --update - update existing records in '-A, --append' file ('" . $params{'updateRecords'} . "')\n" . " possible values: 0, 1\n" . " -C, --style - citation style ('" . $params{'citeStyle'} . "')\n" . " -E, --extract - file from which citation IDs are extracted ('" . $params{'extractFile'} . "')\n" . " supported file types: " . join(', ', sort keys(%citeIDPatterns)) . "\n" . " -F, --format - output format ('" . $params{'format'} . "')\n" . " possible values: html, rtf, pdf, latex, latex_bbl, markdown, ascii,\n" . " ads, bibtex, endnote, isi, ris, atom, mods, oai_dc,\n" . " odf, srw_dc, srw_mods, word\n" . " -L, --showlinks - hide/display links column in html output ('" . $params{'showLinks'} . "')\n" . " possible values: 0, 1\n" . " -O, --order - sort order of returned records ('" . $params{'citeOrder'} . "')\n" . " possible values: author, year, type, type-year, creation-date\n" . " -Q, --showquery - hide/display SQL query in ASCII output ('" . $params{'showQuery'} . "')\n" . " possible values: 0, 1\n" . " -R, --rows - number of records to be returned ('" . $params{'showRows'} . "')\n" . " -S, --start - number of first record to be returned ('" . $params{'startRecord'} . "')\n" . " -V, --view - view type of html output ('" . $params{'viewType'} . "')\n" . " possible values: web, print, mobile\n\n" . "Server Options: -H, --host - URL of the refbase database ('" . $host . "')\n" . " defined shortcuts: " . join(', ', sort keys(%hosts)) . "\n" . " -P, --password - password for given '-U, --user' account"; if ($loginParams{'loginPassword'} ne '') { print "\n (a default pwd has been defined)\n"; } else { print " ('')\n"; } print " -U, --user - login email address of an existing refbase user\n" . " ('" . $loginParams{'loginEmail'} . "')\n\n"; exit $status; } # -------------------------------------------------------------------------------- # Print version number and exit: sub version { local ($status) = @_; print "\nrefbase command line client, version " . $version . "\ncheck for updates at http://cli.refbase.net/\n\n"; exit $status; } # -------------------------------------------------------------------------------- # Print examples and exit: sub examples { local ($status) = @_; print <<'END_EXAMPLES'; -------------------------------------------------------------------------------- REFBASE USAGE EXAMPLES: -------------------------------------------------------------------------------- 1) Find all records where the author field contains 'mock' AND the year field contains '2005': refbase -a=mock -y=2005 -------------------------------------------------------------------------------- 2) Find all records where the author field contains 'mock' OR the title field contains 'photo', and display 10 records starting with the 21st record in the result set: refbase -a=mock -t=photo -q=or -R=10 -S=21 -------------------------------------------------------------------------------- 3) Export records with serial numbers '1', '12' and '34' to Endnote format and save them to a file named 'export.enw': refbase -r=1,12,34 -F=endnote > export.enw -------------------------------------------------------------------------------- 4) Return up to 50 records that were selected by a user with a user ID '2' in RTF format using citation style "Ann Glaciol" and sorting them first by record type, then by year, and save results to a file named 'citations.rtf': refbase -s=yes -u=2 -R=50 -F=rtf -C="Ann Glaciol" -O=type-year > citations.rtf -------------------------------------------------------------------------------- 5) Find all records which were modified today by a user named "admin" and where the location field contains 'msteffens' (note the use of the '-w' option to specify a custom WHERE clause): refbase -w='modified_date = CURDATE() AND modified_by RLIKE "admin"' -l=msteffens -------------------------------------------------------------------------------- 6) Find all records where the cite_key field (of a user with a user ID '2') contains 'steffens', and append records in MODS XML format to file 'mods.xml' if they don't yet exist in that file: refbase -u=2 -c=steffens -F=mods -A=mods.xml -------------------------------------------------------------------------------- 7) Find all records where the contribution_id field contains 'AWI' and where the keywords field contains 'seaweeds', and append records in BibTeX format to file 'paper.bib' if they don't yet exist in that file. In case found records already exist in file 'paper.bib', update them if their modification date is more recent: refbase -i=AWI -k=seaweeds -F=bibtex -A=paper.bib -B=1 -------------------------------------------------------------------------------- 8) Extract all citation IDs from file 'paper.aux', and append matching records (for a user with a user ID '2') in BibTeX format to file 'paper.bib' if they don't yet exist in that file. In case found records already exist in file 'paper.bib', update them if their modification date is more recent: refbase -u=2 -E=paper.aux -F=bibtex -A=paper.bib -B=1 -------------------------------------------------------------------------------- 9) Extract all citation IDs from file 'bibtex.bbl', and save matching records (for a user with a user ID '2') in LaTeX bibliography (.bbl) format to file 'refbase.bbl' using the "APA" citation style (the .bbl file generated by refbase can be used as a replacement of the BibTeX-generated .bbl file): refbase -u=2 -E=bibtex.bbl -F=latex_bbl -C=APA > refbase.bbl -------------------------------------------------------------------------------- END_EXAMPLES exit $status; } __END__