You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

3938 lines
232 KiB

  1. <?php
  2. // Project: Web Reference Database (refbase) <http://www.refbase.net>
  3. // Copyright: Matthias Steffens <mailto:refbase@extracts.de> and the file's
  4. // original author(s).
  5. //
  6. // This code is distributed in the hope that it will be useful,
  7. // but WITHOUT ANY WARRANTY. Please see the GNU General Public
  8. // License for more details.
  9. //
  10. // File: ./includes/import.inc.php
  11. // Repository: $HeadURL: file:///svn/p/refbase/code/branches/bleeding-edge/includes/import.inc.php $
  12. // Author(s): Matthias Steffens <mailto:refbase@extracts.de>
  13. //
  14. // Created: 13-Jan-06, 21:00
  15. // Modified: $Date: 2013-04-29 21:45:09 +0000 (Mon, 29 Apr 2013) $
  16. // $Author: msteffens $
  17. // $Revision: 1377 $
  18. // This file contains functions
  19. // that are used when importing
  20. // records into the database.
  21. // TODO: I18n
  22. // Import the ActiveLink Packages
  23. require_once("classes/include.php");
  24. import("org.active-link.xml.XML");
  25. import("org.active-link.xml.XMLDocument");
  26. include 'includes/transtab_bibtex_refbase.inc.php'; // include BibTeX markup -> refbase search & replace patterns
  27. include 'includes/transtab_endnotexml_refbase.inc.php'; // include Endnote XML text style markup -> refbase search & replace patterns
  28. if ($contentTypeCharset == "UTF-8") // variable '$contentTypeCharset' is defined in 'ini.inc.php'
  29. include_once 'includes/transtab_latex_unicode.inc.php'; // include LaTeX -> Unicode translation table
  30. else // we assume "ISO-8859-1" by default
  31. include_once 'includes/transtab_latex_latin1.inc.php'; // include LaTeX -> Latin1 translation table
  32. include_once('includes/classes/org/simplepie/SimplePie.compiled.php'); // include the SimplePie library
  33. // --------------------------------------------------------------------
  34. // ISI TO CSA
  35. // This function converts records from "ISI Web of Science" format to "CSA" format
  36. // in order to enable import of ISI WoS records via the 'csaToRefbase()' function.
  37. // ISI WoS records must contain at least the tags "PT" and "SO" and end with "\nER\n".
  38. //
  39. // Authors: this function was originally written by Joachim Almergren <joachim.almergren@umb.no>
  40. // and was re-written by Matthias Steffens <mailto:refbase@extracts.de> to enable batch import
  function isiToCsa($isiSourceData)
  {
      // Converts one or more "ISI Web of Science" records into "CSA" format.
      //
      // $isiSourceData - string holding the ISI record(s); every record must be terminated by an "ER" line
      //                  and must contain at least the "PT" and "SO" tags (other records are skipped)
      // Returns a string with all converted CSA records, separated by a blank line ('' if nothing was converted).

      global $punct, $upper, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'

      // Function preferences:
      $extractAllAddresses = false; // if set to 'true', all addresses will be extracted from the ISI "C1" field;
                                    // set to 'false' if you only want to extract the first address given in the ISI "C1" field
      $extractEmail = true; // if set to 'true', the first email address will be extracted from the ISI "EM" field and appended to the first address in "AF: Affiliation";
                            // set to 'false' if you don't want to extract the email address

      // Data lines of a field are indented by four spaces in the generated CSA output
      // (ISI continuation lines that only carry three spaces get normalized to this indent below):
      $csaDataIndent = "    ";

      // Map of all CSA tags that we generate to their corresponding ISI tags ("CSA tag" => "ISI tag"):
      $isiToCsaTagsArray = array(
          "PT: Publication Type"    => "PT",
          "AU: Author"              => "AU",
          "TI: Title"               => "TI",
          "SO: Source"              => "SO",
          "PY: Publication Year"    => "PY",
          // "JN: Journal Name"     => "", // the 'csaToRefbase()' function will generate the full journal name from "SO: Source"
          "JA: Abbrev Journal Name" => "JI",
          // "MT: Monograph Title"  => "", // the ISI WoS database does only contain journal articles (AFAIK)
          "JV: Journal Volume"      => "VL",
          "JI: Journal Issue"       => "IS",
          // "JP: Journal Pages"    => "", // ISI WoS has separate tags for start page ("BP") and end page ("EP"), "JP: Journal Pages" is computed from these below
          "AF: Affiliation"         => "C1",
          // "CA: Corporate Author" => "",
          "DE: Descriptors"         => "DE",
          "AB: Abstract"            => "AB",
          "PB: Publisher"           => "PU",
          // ""                     => "PI", // AFAIK, CSA offers no field for the place of publisher
          // "ED: Editor"           => "",
          "LA: Language"            => "LA",
          // "SL: Summary Language" => "",
          // "OT: Original Title"   => "",
          "IS: ISSN"                => "SN",
          // "IB: ISBN"             => "",
          // "ER: Environmental Regime" => "",
          // "CF: Conference"       => "",
          "NT: Notes"               => "UT", // we'll import the ISI record ID to the notes field
          // "DO: DOI"              => "DI", // in tests, the ISI "DI" field contained some other identifier than the DOI
      );

      // ----------------------------------------------------------------

      // SPLIT INPUT text on the "ER" (= end of record) tag that terminates every ISI record
      // (the 'PREG_SPLIT_NO_EMPTY' flag causes only non-empty pieces to be returned):
      $isiRecordsArray = preg_split("/\s*[\r\n]ER *[\r\n]\s*/", $isiSourceData, -1, PREG_SPLIT_NO_EMPTY);
      $recordsCount = count($isiRecordsArray); // count how many records are available

      $csaRecordsArray = array(); // will hold all records that were converted to CSA format

      // ----------------------------------------------------------------

      // LOOP OVER EACH RECORD:
      foreach ($isiRecordsArray as $i => $isiRecord)
      {
          // we'll only process a record if its text does contain the "PT" tag as well as the "SO" tag:
          if (!preg_match("/^PT /m", $isiRecord) || !preg_match("/^SO /m", $isiRecord))
              continue;

          $csaRecordFieldsArray = array(); // will hold all fields of this record that we've converted to CSA format

          // extract first email address from ISI "EM" field
          // (the value is captured directly from the line-anchored match, so this also works if "EM" is the very first line of the record):
          $emailAddress = "";
          if (preg_match("/^EM ([^ \r\n]+)/m", $isiRecord, $emailMatch))
              $emailAddress = $emailMatch[1];

          // extract start page (ISI "BP" field) and end page (ISI "EP" field):
          $pages = array();
          if (preg_match("/^BP ([^ \r\n][^\r\n]*)/m", $isiRecord, $startPageMatch))
              $pages[] = $startPageMatch[1];
          if (preg_match("/^EP ([^ \r\n][^\r\n]*)/m", $isiRecord, $endPageMatch))
              $pages[] = $endPageMatch[1];

          if (!empty($pages))
              $pageRange = implode("-", $pages);
          // if no start or end page is given, we'll try the ISI "PG" field that indicates the total number of pages:
          elseif (preg_match("/^PG ([^ \r\n][^\r\n]*)/m", $isiRecord, $pageCountMatch))
              $pageRange = $pageCountMatch[1] . " pp";
          else
              $pageRange = "";

          // split the record into its individual fields (a field starts with a two-character tag followed by a space):
          $isiRecordFieldsArray = preg_split("/[\r\n]+(?=\w\w )/", $isiRecord);

          // LOOP OVER EACH FIELD:
          foreach ($isiRecordFieldsArray as $recordField)
          {
              // we'll only process a field if it starts with two word characters followed by a space:
              if (!preg_match("/^\w\w /", $recordField))
                  continue;

              // split the field into its tag and its field data:
              list($isiTag, $recordFieldData) = preg_split("/(?<=^\w\w) /", $recordField, 2);

              // look up the CSA tag for this ISI tag; fields that we don't convert are skipped:
              $csaTag = array_search($isiTag, $isiToCsaTagsArray, true);
              if ($csaTag === false)
                  continue;

              // add a space to the beginning of any data line that starts with only three spaces (instead of four):
              $recordFieldData = preg_replace("/^   (?! )/m", $csaDataIndent, $recordFieldData);

              // convert ISI publication type "J" into CSA format ("Journal Article"):
              if (($csaTag == "PT: Publication Type") && ($recordFieldData == "J"))
                  $recordFieldData = "Journal Article";

              // merge multiple authors (that are printed on separate lines) with a semicolon (';') and a space:
              elseif ($csaTag == "AU: Author")
                  $recordFieldData = preg_replace("/\s*[\r\n]\s*/", "; ", $recordFieldData);

              // process address info:
              elseif ($csaTag == "AF: Affiliation")
              {
                  // remove any trailing punctuation from end of string:
                  $recordFieldData = preg_replace("/[$punct]+$/$patternModifiers", "", $recordFieldData);

                  // multiple addresses are given as one address per line:
                  if (preg_match("/[\r\n]/", $recordFieldData))
                      $addressesArray = preg_split("/[$punct\s]*[\r\n]\s*/$patternModifiers", $recordFieldData);
                  else
                      $addressesArray = array($recordFieldData);

                  // append the first email address from ISI "EM" field to the first address in "AF: Affiliation":
                  if ($extractEmail && !empty($emailAddress))
                      $addressesArray[0] .= ", Email: " . $emailAddress;

                  if ($extractAllAddresses)
                      $recordFieldData = implode("; ", $addressesArray); // merge multiple addresses with '; '
                  else
                      $recordFieldData = $addressesArray[0]; // use only the first address
              }

              // if a comma (',') is used as keyword delimiter, we'll convert it into a semicolon (';'):
              elseif (($csaTag == "DE: Descriptors") && !preg_match("/;/", $recordFieldData))
                  $recordFieldData = preg_replace("/ *, */", "; ", $recordFieldData);

              // if all of the field data is in uppercase letters, we attempt to convert the string to something more readable
              // (the ISI record ID from the "UT" field is excluded; note that this case transformation won't do the right thing
              //  for author initials and abbreviations, but the result is better than the whole string being upper case):
              if (($isiTag != "UT") && preg_match("/^[$upper\W\d]+$/$patternModifiers", $recordFieldData))
                  $recordFieldData = changeCase('title', $recordFieldData); // function 'changeCase()' is defined in 'include.inc.php'

              // merge CSA field tag and data again & append this field to the array of CSA fields:
              $csaRecordFieldsArray[] = $csaTag . "\n" . $csaDataIndent . $recordFieldData;
          }

          // append "JP: Journal Pages" field with generated page range to array of CSA fields:
          if (!empty($pageRange))
              $csaRecordFieldsArray[] = "JP: Journal Pages\n" . $csaDataIndent . $pageRange;

          // merge CSA fields into a string, prefix it with a CSA record identifier, and append it to the array of CSA records:
          $csaRecordsArray[] = "Record " . ($i + 1) . " of " . $recordsCount . "\n\n" . implode("\n", $csaRecordFieldsArray);
      }

      // return all CSA records merged into a string:
      return implode("\n\n", $csaRecordsArray);
  }
  189. // --------------------------------------------------------------------
  190. // CROSSREF TO REFBASE
  191. // This function converts records from CrossRef's "unixref" XML format into the standard "refbase"
  192. // array format which can be then imported by the 'addRecords()' function in 'include.inc.php'.
  193. //
  194. // NOTES: - So far, CrossRef seems to be the only provider of this data format & they do not yet
  195. // use it to return more than one result. However, since function 'fetchDataFromCrossRef()'
  196. // can fetch data for multiple DOIs and appends found records to '$sourceText', this
  197. // function does allow for batch import (though we might need to tweak '$recordDelimiter'
  198. // if multiple results would be returned directly by CrossRef).
  199. //
  200. // - This is our first and only native XML import format, so we do not use functions
  201. // 'validateRecords()' or 'parseRecords()'.
  202. //
  203. // TODO:
  204. // - one of these, in order of preference:
  205. // - change functions 'validateRecords()' & 'parseRecords()' to accept non-tagged, XML references
  206. // - add new XML validation/parsing functions
  207. // - transform XML to a tagged format
  208. //
  209. // Authors: Richard Karnesky <mailto:karnesky@gmail.com> and
  210. // Matthias Steffens <mailto:refbase@extracts.de>
  function crossrefToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
  {
      // Converts CrossRef "unixref" XML record(s) into the refbase import array format.
      //
      // $sourceText               - string containing one or more '<doi_record>' elements
      // $importRecordsRadio       - not used by this function (kept for a uniform importer signature)
      // $importRecordNumbersArray - not used by this function (kept for a uniform importer signature)
      //
      // Returns an array with these elements (in this order):
      //   - the import data array as built by function 'buildImportArray()'
      //   - the total number of records found in '$sourceText'
      //   - the list of record numbers (1-based) that were recognized & parsed
      //   - the list of record numbers (1-based) that were NOT recognized (CrossRef error or no journal article data)
      //   - the global '$errors' array (messages get appended to its 'sourceText' element)

      global $upper, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
      global $errors;

      // Pattern by which the input text will be split into individual records:
      $recordDelimiter = "(\s*(<\?xml[^<>]*\?>\s*)?<doi_records[^<>]*>)?\s*(?=<doi_record[^<>]*>)" // splits before '<doi_record>'
                       . "|"
                       . "(?<=<\/doi_record>)\s*(<\/doi_records>\s*)?"; // splits after '</doi_record>'

      // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
      // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
      //         - the split pattern must be specified as perl-style regular expression (including the leading & trailing
      //           slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
      $personDelimiter = "/ *; */";

      // Pattern by which a person's family name is separated from the given name (or initials):
      // (same regular expression requirements as for '$personDelimiter')
      $familyNameGivenNameDelimiter = "/ *, */";

      // Specifies whether the person's family name comes first within a person's name
      // ('true': family name is followed by the given name (or initials), 'false': family name comes *after* the given name (or initials))
      $familyNameFirst = true;

      // Specifies whether a person's full given name(s) shall be shortened to initial(s):
      // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized
      //         - if set to 'false', given names (and any initials) are taken as is
      //         - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
      $shortenGivenNames = true;

      // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
      $transformCase = true;

      // Postprocessor actions:
      // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
      // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
      //  Note that the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
      //     "/Search Pattern/" => "Replace Pattern"
      $postprocessorActionsArray = array(
          array(
              'fields'  => array("title"),
              'actions' => array(
                  "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
              )
          ),
          array(
              'fields'  => array("author", "title", "publication", "abbrev_journal"), // convert HTML font attributes (which some publishers include in their CrossRef data, see e.g. doi:10.1515/BOT.2001.002)
              'actions' => array(
                  "/<sup>(.+?)<\/sup>/i" => "[super:\\1]", // replace '<sup>...</sup>' with refbase markup ('[super:...]')
                  "/<sub>(.+?)<\/sub>/i" => "[sub:\\1]", // replace '<sub>...</sub>' with refbase markup ('[sub:...]')
                  "/<i>(.+?)<\/i>/i" => "_\\1_", // replace '<i>...</i>' with refbase markup ('_..._')
                  "/<b>(.+?)<\/b>/i" => "**\\1**" // replace '<b>...</b>' with refbase markup ('**...**')
              )
          ),
          array(
              'fields'  => array("author", "title", "year", "publication", "abbrev_journal", "volume", "issue", "pages", "issn", "url", "doi"), // not sure whether we need to do this for all fields, it occurs e.g. for doi:10.1007/BF00391383 in the 'url' field
              'actions' => array(
                  "/<!\[CDATA\[(.+?)\]\]>/" => "\\1" // remove any '<![CDATA[...]]>' wrapper
              )
          )
      );

      // -----------------------------------------

      // PRE-PROCESS SOURCE TEXT:

      // Split input text into individual records:
      $recordArray = splitSourceText($sourceText, $recordDelimiter, false);

      // PROCESS SOURCE DATA:

      $parsedRecordsArray = array(); // will hold parsed data of all records that shall be imported

      // NOTE: Records are not validated beyond checking for a CrossRef error and for the presence of journal article data
      $importRecordNumbersRecognizedFormatArray = array(); // record numbers of those records which were of a recognized format
      $importRecordNumbersNotRecognizedFormatArray = array(); // record numbers of those records which had an UNrecognized format

      $recordsCount = count($recordArray); // count how many records are available

      // -----------------------------------------

      // LOOP OVER EACH RECORD:
      for ($i = 0; $i < $recordsCount; $i++)
      {
          $recordNumber = $i + 1; // '$i' starts with 0 so we have to add 1 to point to the correct record number

          // Parse record XML:
          $XML = new XML($recordArray[$i]);

          // Check for any problems that prevent import of this record:
          // NOTE(review): the '!empty(...[0])' checks assume that 'getBranches()' yields an empty/false result
          //               when nothing matches -- confirm against the ActiveLink XML package
          $recordFailed = false;
          $recordProblem = "";
          $articleXML = array();

          $errorXML = $XML->getBranches("doi_record/crossref","error");

          if (!empty($errorXML[0])) // CrossRef reported an error for this record
          {
              $recordFailed = true;
              $recordProblem = $errorXML[0]->getTagContent("error"); // e.g. "DOI not found in CrossRef"
          }
          else
          {
              $articleXML = $XML->getBranches("doi_record/crossref/journal","journal_article");

              // This importer only knows how to read journal articles; without a '<journal_article>' branch
              // there's nothing we could extract (and dereferencing the missing branch would abort the import):
              if (empty($articleXML[0]))
              {
                  $recordFailed = true;
                  $recordProblem = "no journal article data found";
              }
          }

          if ($recordFailed)
          {
              $importRecordNumbersNotRecognizedFormatArray[] = $recordNumber; // this record's format is NOT recognized

              // Prepare an appropriate error message:
              $errorMessage = "Record " . $recordNumber . ": " . $recordProblem . "!";

              if (!isset($errors["sourceText"]))
                  $errors["sourceText"] = $errorMessage;
              else
                  $errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;

              continue; // process next record
          }

          // A DOI record with journal article data was found:
          $importRecordNumbersRecognizedFormatArray[] = $recordNumber; // this record's format IS recognized

          $fieldParametersArray = array(); // will hold all fields that were extracted for this record
          $fieldParametersArray['type'] = 'Journal Article'; // MOST CrossRef entities are journal articles. TODO: find what isn't & fix the type

          // Parse remaining main XML branches:
          $metadataXML = $XML->getBranches("doi_record/crossref/journal","journal_metadata");
          $issueXML = $XML->getBranches("doi_record/crossref/journal","journal_issue");
          $contributorsXML = $XML->getBranches("doi_record/crossref/journal/journal_article/contributors","person_name");

          // Process '$metadataXML':
          // TODO:
          //   - Put CODEN and/or publisher-specific article IDs ('<publisher_item>') into the 'notes' field (?)
          if (!empty($metadataXML[0]))
          {
              $fieldParametersArray['publication'] = $metadataXML[0]->getTagContent("journal_metadata/full_title");
              $fieldParametersArray['abbrev_journal'] = $metadataXML[0]->getTagContent("journal_metadata/abbrev_title");
          }
          else // no journal metadata given
          {
              $fieldParametersArray['publication'] = "";
              $fieldParametersArray['abbrev_journal'] = "";
          }

          // Get print ISSN ('media_type="print"'); if there's no ISSN tag with this attribute,
          // we fall back to the first given ISSN tag (if any):
          $issn = "";
          $issnXML = $XML->getBranches("doi_record/crossref/journal/journal_metadata","issn","media_type","print");

          if (empty($issnXML[0]))
              $issnXML = $XML->getBranches("doi_record/crossref/journal/journal_metadata","issn");

          if (!empty($issnXML[0]))
              $issn = $issnXML[0]->getTagContent("issn");

          if (!empty($issn))
              $fieldParametersArray['issn'] = $issn;

          // Process '$issueXML':
          if (!empty($issueXML[0]))
          {
              $fieldParametersArray['year'] = $issueXML[0]->getTagContent("journal_issue/publication_date/year");
              $fieldParametersArray['volume'] = $issueXML[0]->getTagContent("journal_issue/journal_volume/volume");
              $fieldParametersArray['issue'] = $issueXML[0]->getTagContent("journal_issue/issue");
          }
          else // no issue info given
          {
              $fieldParametersArray['year'] = "";
              $fieldParametersArray['volume'] = "";
              $fieldParametersArray['issue'] = "";
          }

          // Process '$articleXML':
          $fieldParametersArray['title'] = $articleXML[0]->getTagContent("journal_article/titles/title");

          //   - Append any subtitle to the main title:
          $subTitleXML = $articleXML[0]->getBranches("journal_article/titles","subtitle");

          if (!empty($subTitleXML[0]))
              $fieldParametersArray['title'] .= ": " . $subTitleXML[0]->getTagContent("subtitle");

          $fieldParametersArray['doi'] = $articleXML[0]->getTagContent("journal_article/doi_data/doi");
          $fieldParametersArray['url'] = $articleXML[0]->getTagContent("journal_article/doi_data/resource");
          $fieldParametersArray['startPage'] = $articleXML[0]->getTagContent("journal_article/pages/first_page");
          $fieldParametersArray['endPage'] = $articleXML[0]->getTagContent("journal_article/pages/last_page");

          // Process '$contributorsXML':
          // TODO:
          //   - Differentiate authors from other types of contributors
          $author = "";

          if (!empty($contributorsXML)) // records without any contributors are skipped here
          {
              foreach ($contributorsXML as $contributor)
              {
                  $givenName = $contributor->getTagContent("person_name/given_name");
                  $familyName = $contributor->getTagContent("person_name/surname");

                  // If the author's family (or given) name is entirely in uppercase letters, we attempt to convert the string to something more readable:
                  if ($transformCase)
                  {
                      // Convert upper case to title case (converts e.g. "STEFFENS" into "Steffens"):
                      if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $familyName))
                          $familyName = changeCase('title', $familyName); // function 'changeCase()' is defined in 'include.inc.php'

                      // Convert upper case to title case (converts e.g. "MATTHIAS" into "Matthias"):
                      if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $givenName))
                          $givenName = changeCase('title', $givenName);
                  }

                  // Append any name suffix to the surname:
                  $nameSuffixXML = $contributor->getBranches("person_name","suffix");

                  if (!empty($nameSuffixXML[0]))
                      $familyName .= " " . $nameSuffixXML[0]->getTagContent("suffix");

                  $author .= $familyName . ", " . $givenName . "; ";
              }
          }

          $fieldParametersArray['author'] = trim($author, "; "); // strip the trailing person delimiter

          // Standardize field data contained in '$fieldParametersArray':
          foreach ($fieldParametersArray as $fieldKey => $fieldData)
          {
              if ($fieldKey == "url")
              {
                  // in case of the 'url' field, we just decode any ampersand characters:
                  if (preg_match('/&amp;/', $fieldData))
                      $fieldParametersArray[$fieldKey] = str_replace('&amp;', '&', $fieldData);
              }
              // Decode HTML special chars:
              elseif (preg_match('/&(amp|quot|#0?39|lt|gt);/', $fieldData))
                  $fieldParametersArray[$fieldKey] = decodeHTMLspecialchars($fieldData); // function 'decodeHTMLspecialchars()' is defined in 'include.inc.php'
          }

          // Function 'standardizeFieldData()' e.g. performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference:
          $fieldParametersArray = standardizeFieldData($fieldParametersArray, "CrossRef XML", $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray);

          // Append the array of extracted field data to the main data array which holds all records to import:
          $parsedRecordsArray[] = $fieldParametersArray;
      }

      // -----------------------------------------

      // Build refbase import array:
      $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
                                          "1.0", // 'version' - the version of the given array structure
                                          "http://refbase.net/import/crossref/", // 'creator' - the name of the script/importer (preferably given as unique URI)
                                          "Richard Karnesky", // 'author' - author/contact name of the person who's responsible for this script/importer
                                          "karnesky@users.sourceforge.net", // 'contact' - author's email/contact address
                                          array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
                                          $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)

      return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  }
  394. // --------------------------------------------------------------------
  395. // ARXIV TO REFBASE
  396. // This function converts records from arXiv.org's Atom XML Opensearch format into the standard "refbase"
  397. // array format which can be then imported by the 'addRecords()' function in 'include.inc.php'.
  398. // Info on the arXiv API (including sample code) is given at:
  399. // <http://export.arxiv.org/api_help/>
  400. // <http://export.arxiv.org/api_help/docs/user-manual.html>
  401. // <http://export.arxiv.org/api_help/docs/examples/php_arXiv_parsing_example.txt>
  402. //
  403. // Requires the SimplePie library (by Ryan Parman and Geoffrey Sneddon), which is
  404. // available under the BSD license from: <http://simplepie.org>
  405. //
  406. // '$feed' must contain the list of Atom feed items given as a SimplePie object
  407. function arxivToRefbase(&$feed, $importRecordsRadio, $importRecordNumbersArray)
  408. {
  409. global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
  410. global $contentTypeCharset; // defined in 'ini.inc.php'
  411. global $errors;
  412. global $showSource;
  413. // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
  414. // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
  415. // - the split pattern must be specified as perl-style regular expression (including the leading & trailing
  416. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  417. $personDelimiter = "/ *; */";
  418. // Pattern by which a person's family name is separated from the given name (or initials):
  419. // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
  420. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  421. $familyNameGivenNameDelimiter = "/ (?=([$upper]+[-$alpha]+)( *;|$))/$patternModifiers";
  422. // Specifies whether the person's family name comes first within a person's name
  423. // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
  424. $familyNameFirst = false;
  425. // Specifies whether a person's full given name(s) shall be shortened to initial(s):
  426. // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
  427. // - if set to 'false', given names (and any initials) are taken as is
  428. // - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
  429. $shortenGivenNames = true;
  430. // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
  431. $transformCase = true;
  432. // Postprocessor actions:
  433. // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
  434. // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
  435. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  436. // "/Search Pattern/" => "Replace Pattern"
  437. $postprocessorActionsArray = array(
  438. array(
  439. 'fields' => array("title", "abstract", "notes"),
  440. 'actions' => array(
  441. "/ *[\n\r]+ */" => " " // transform whitespace: replace any run of whitespace that includes newline/return character(s) with a space
  442. )
  443. ),
  444. array(
  445. 'fields' => array("title"),
  446. 'actions' => array(
  447. "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
  448. )
  449. )
  450. );
  451. // -----------------------------------------
  452. // PROCESS SOURCE DATA:
  453. // Initialize array variables:
  454. $parsedRecordsArray = array(); // initialize array variable which will hold parsed data of all records that shall be imported
  455. // NOTE: We do NOT validate records yet, i.e. we assume that they are perfect and attempt to import all of them:
  456. $importRecordNumbersRecognizedFormatArray = array(); // initialize array variable which will hold all record numbers of those records that shall be imported AND which were of a recognized format
  457. $importRecordNumbersNotRecognizedFormatArray = array(); // same for all records that shall be imported BUT which had an UNrecognized format
  458. // Use these namespaces to retrieve tags:
  459. $atomNamespace = 'http://www.w3.org/2005/Atom';
  460. $opensearchNamespace = 'http://a9.com/-/spec/opensearch/1.1/';
  461. $arxivNamespace = 'http://arxiv.org/schemas/atom';
  462. // Get feed data:
  463. $recordArray = $feed->get_items(); // fetch all feed items into an array
  464. $recordsCount = count($recordArray); // count how many records are available
  465. // -----------------------------------------
  466. // LOOP OVER EACH RECORD:
  467. for ($i=0; $i<$recordsCount; $i++) // for each record...
  468. {
  469. $fieldParametersArray = array(); // setup an empty array (it will hold all fields that were extracted for a given record)
  470. $record = $recordArray[$i]; // this will make things a bit more readable
  471. // Check for any errors:
  472. if ($record->get_title() == "Error")
  473. {
  474. $importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized
  475. $arXivError = $record->get_description(); // e.g. "incorrect id format for 1234.12345"
  476. // Prepare an appropriate error message:
  477. $errorMessage = "Record " . ($i + 1) . ": " . $arXivError . "!";
  478. if (!isset($errors["sourceText"]))
  479. $errors["sourceText"] = $errorMessage;
  480. else
  481. $errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;
  482. }
  483. elseif (!($record->get_permalink())) // empty record (which has no matching arXiv ID); ATM, this occurs e.g. when querying for "arXiv:1234.9999"
  484. {
  485. $importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized
  486. // Prepare an appropriate error message:
  487. $errorMessage = "Record " . ($i + 1) . ": nothing found!";
  488. if (!isset($errors["sourceText"]))
  489. $errors["sourceText"] = $errorMessage;
  490. else
  491. $errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;
  492. }
  493. else // an arXiv record was found
  494. {
  495. // NOTE: We do NOT yet validate any found records, i.e. for now, we'll just assume that they are ok:
  496. $importRecordNumbersRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format IS recognized ('$i' starts with 0 so we have to add 1 to point to the correct record number)
  497. // Extract elements of the current Atom XML entry:
  498. // - type:
  499. $fieldParametersArray['type'] = 'Journal Article'; // NOTE: Are all arXiv records journal articles? TODO: find what isn't & fix the type
  500. // - id:
  501. $fieldParametersArray['notes'] = str_replace("http://arxiv.org/abs/", "arXiv:", $record->get_permalink()); // extract the arXiv ID from the abstract URL in the 'id' element & prefix it with "arXiv:"
  502. // - title:
  503. $fieldParametersArray['title'] = $record->get_title();
  504. // - summary:
  505. if ($abstract = $record->get_description())
  506. $fieldParametersArray['abstract'] = $abstract;
  507. // - author:
  508. // NOTE: If we didn't want to extract author affiliation info, we could just use standard SimplePie functions ('get_authors()' and 'get_name()')
  509. $authorsArray = array();
  510. $addressArray = array();
  511. $authors = $record->get_item_tags($atomNamespace, 'author');
  512. foreach ($authors as $author)
  513. {
  514. $authorName = "";
  515. $authorLastName = "";
  516. $authorAddressArray = "";
  517. if (isset($author['child'][$atomNamespace]['name']) AND ($authorName = $author['child'][$atomNamespace]['name'][0]['data']))
  518. {
  519. // -- name:
  520. // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
  521. // NOTE: For authors, we need to perform charset conversion up here (and not further down below, as is done for all the other fields),
  522. // since otherwise the below '$upper' and '$alpha' character class elements would fail to match!
  523. if (($contentTypeCharset == "ISO-8859-1") AND (detectCharacterEncoding($authorName) == "UTF-8")) // function 'detectCharacterEncoding()' is defined in 'include.inc.php'
  524. $authorName = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorName, "UTF-8"); // function 'convertToCharacterEncoding()' is defined in 'include.inc.php'
  525. // Change the formatting of author names to the one used by refbase, i.e. the family name comes first, and a comma separates family name & initials:
  526. // (further standardisation of person names is done in function 'standardizeFieldData()'; see also note for '$familyNameGivenNameDelimiter' above)
  527. // NOTE: With the above settings for '$familyNameGivenNameDelimiter' and '$familyNameFirst' this isn't necessary anymore
  528. // $authorName = preg_replace("/^(.+?) +([$upper]+[-$alpha]+)$/$patternModifiers", "\\2, \\1", $authorName);
  529. $authorsArray[] = $authorName;
  530. // -- arxiv:affiliation:
  531. if (isset($author['child'][$arxivNamespace]) AND ($authorAffiliations = $author['child'][$arxivNamespace]['affiliation']))
  532. {
  533. foreach ($authorAffiliations as $authorAffiliation)
  534. $authorAddressArray[] = $authorAffiliation['data'];
  535. $authorAddresses = implode(", ", $authorAddressArray);
  536. // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
  537. if (($contentTypeCharset == "ISO-8859-1") AND (detectCharacterEncoding($authorAddresses) == "UTF-8"))
  538. $authorAddresses = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $authorAddresses, "UTF-8");
  539. $authorLastName = preg_replace("/^([$upper]+[-$alpha]+).+$/$patternModifiers", "\\1", $authorName); // extract authors last name
  540. $addressArray[] = $authorLastName . ": " . $authorAddresses;
  541. }
  542. }
  543. }
  544. if (!empty($authorsArray))
  545. $fieldParametersArray['author'] = implode("; ", $authorsArray); // merge multiple authors
  546. if (!empty($addressArray))
  547. $fieldParametersArray['address'] = implode("; ", $addressArray); // merge multiple author affiliations
  548. // - links:
  549. //
  550. // TODO: Currently, we just copy a link to the PDF to the 'file' field. It might be desirable to fetch the actual PDF and store it on the refbase server.
  551. //
  552. // NOTE: - In order to extract any links, we access the raw SimplePie object here; This is done since, in SimplePie v1.1.1, the standard SimplePie functions
  553. // 'get_link()' and 'get_links()' only support checking for the 'rel' attribute, but don't allow to filter on the 'type' or 'title' attribute. However,
  554. // we need to check the 'type' & 'title' attributes in order to assign PDF & DOI links to the 'file' & 'doi' fields, respectively. Alternatively, we
  555. // could also get this information from the URL itself, but that may fail if arXiv changes its URL pattern.
  556. // - More info on how to grab custom tags or attributes: <http://simplepie.org/wiki/tutorial/grab_custom_tags_or_attributes>
  557. $links = $record->get_item_tags($atomNamespace, 'link');
  558. foreach ($links as $link)
  559. {
  560. if (isset($link['attribs']['']['href']))
  561. {
  562. // -- file:
  563. if (!isset($fieldParametersArray['file']) AND isset($link['attribs']['']['title']) AND ($link['attribs']['']['title'] == "pdf")) // we could also check for 'type="application/pdf"'
  564. $fieldParametersArray['file'] = $link['attribs']['']['href'];
  565. // -- url:
  566. // NOTE: the 'id' element (see above) also contains the URL to the abstract page for the current article
  567. elseif (!isset($fieldParametersArray['url']) AND isset($link['attribs']['']['type']) AND ($link['attribs']['']['type'] == "text/html")) // we could also check for 'title' being unset
  568. $fieldParametersArray['url'] = $link['attribs']['']['href'];
  569. // -- doi:
  570. // NOTE: the 'arxiv:doi' element also contains the DOI for the current article
  571. elseif (!isset($fieldParametersArray['doi']) AND isset($link['attribs']['']['title']) AND ($link['attribs']['']['title'] == "doi"))
  572. $fieldParametersArray['doi'] = str_replace("http://dx.doi.org/", "", $link['attribs']['']['href']);
  573. }
  574. }
  575. // - arxiv:comment:
  576. if ($comment = $record->get_item_tags($arxivNamespace, 'comment'))
  577. $fieldParametersArray['notes'] .= "; " . $comment[0]['data']; // TODO: if arXiv records can include multiple comments, we'd need to loop over all of them
  578. // - arxiv:primary_category:
  579. // TODO: Should we copy the term given in the 'arxiv:primary_category' element to the 'area' field?
  580. // - arxiv:category:
  581. $categoriesArray = array();
  582. foreach ($record->get_categories() as $category)
  583. $categoriesArray[] = $category->get_label();
  584. if (!empty($categoriesArray))
  585. $fieldParametersArray['keywords'] = implode("; ", $categoriesArray); // merge multiple categories
  586. // - arxiv:journal_ref:
  587. if ($journalRef = $record->get_item_tags($arxivNamespace, 'journal_ref'))
  588. {
  589. // We extract the full 'journal_ref' string into its own variable since we're going to mess with it:
  590. $journalRefData = preg_replace("/ *[\n\r]+ */", " ", $journalRef[0]['data']); // transform whitespace: replace any run of whitespace that includes newline/return character(s) with a space
  591. // NOTE: The formatting of the 'journal_ref' string can vary heavily, so
  592. // the below parsing efforts may fail. Therefore, we'll also copy the
  593. // original 'journal_ref' string to the 'notes' field, and display it
  594. // in the header message when importing single records.
  595. $fieldParametersArray['source'] = $journalRefData;
  596. $fieldParametersArray['notes'] .= "; Journal Ref: " . $journalRefData;
  597. // Extract source info from the 'journal_ref' string into the different fields:
  598. // NOTE: We try to use reasonably liberal (and thus rather ugly!) regex patterns
  599. // which should catch most of the commonly used formatting styles. However,
  600. // as noted above, due to the varying formatting of the 'journal_ref' string,
  601. // this may not be always entirely successful.
  602. // TODO: Extract ISSN from the 'journal_ref' string (see e.g. 'arXiv:cond-mat/0506611v1')
  603. // -- journal:
  604. $journalName = preg_replace("/^(.+?)(?= *(\(?\d+|[,;]|(v(ol)?\.?|volume) *\d+|$)).*/i", "\\1", $journalRefData); // extract journal name
  605. $journalRefData = preg_replace("/^(.+?)(?= *(\(?\d+|[,;]|(v(ol)?\.?|volume) *\d+|$))[,; ]*/i", "", $journalRefData); // remove journal name from 'journal_ref' string
  606. if (preg_match("/\./", $journalName))
  607. $fieldParametersArray['abbrev_journal'] = preg_replace("/(?<=\.)(?![ )]|$)/", " ", $journalName); // ensure that any dots are followed with a space
  608. else
  609. $fieldParametersArray['publication'] = $journalName;
  610. // -- volume:
  611. // NOTE: The volume is assumed to be the first number that follows the journal name, and
  612. // which is followed by another four-digit number (which is assumed to be the year).
  613. if (preg_match("/^(?:(?:v(?:ol)?\.?|volume) *)?(\w*\d+\w*)(?= *.*?\d{4})/i", $journalRefData))
  614. {
  615. $fieldParametersArray['volume'] = preg_replace("/^(?:(?:v(?:ol)?\.?|volume) *)?(\w*\d+\w*)(?= *.*?\d{4}).*/i", "\\1", $journalRefData); // extract volume
  616. $journalRefData = preg_replace("/^(?:(?:v(?:ol)?\.?|volume) *)?(\w*\d+\w*)(?= *.*?\d{4})[,; ]*/i", "", $journalRefData); // remove volume from 'journal_ref' string
  617. }
  618. // -- year (take 1):
  619. // NOTE: For the first take, we assume the year to be the first occurrence of a four-digit number
  620. // that's wrapped in parentheses.
  621. if (preg_match("/\(\d{4}\)/i", $journalRefData))
  622. {
  623. $fieldParametersArray['year'] = preg_replace("/^.*?\((\d{4})\).*?$/i", "\\1", $journalRefData); // extract year
  624. $journalRefData = preg_replace("/[,; ]*\(\d{4}\)[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string
  625. }
  626. // -- issue:
  627. // NOTE: The issue is only recognized if it is preceded with a "n/no/number" prefix, or if it is a
  628. // number with less than four digits that is enclosed in parentheses (we can check for the latter
  629. // case since four-digit years that are wrapped in parens have already been removed). The regex
  630. // patterns below also try to account for some non-digit characters in issue numbers.
  631. // TODO: Support issue extraction from "Journal Vol:No ..." format (see e.g. 'arXiv:cond-mat/0703452v2')
  632. if (preg_match("/(?:(?:n\.|no\.?|number) *)(\w*[\d\/-]+\w*)|\((\w*(?:\d{1,3}|\d{1,2}[\/-]+\d{1,2})\w*)\)/i", $journalRefData)) // matches e.g. "no. 2", "Number 2" or "(1/2)"
  633. {
  634. $fieldParametersArray['issue'] = preg_replace("/^.*?(?:(?:(?:n\.|no\.?|number) *)(\w*[\d\/-]+\w*)|\((\w*(?:\d{1,3}|\d{1,2}[\/-]+\d{1,2})\w*)\)).*?$/i", "\\1\\2", $journalRefData); // extract issue
  635. $journalRefData = preg_replace("/[,; ]*(?:(?:(?:n\.|no\.?|number) *)(\w*[\d\/-]+\w*)|\((\w*(?:\d{1,3}|\d{1,2}[\/-]+\d{1,2})\w*)\))[,; ]*/i", "", $journalRefData); // remove issue from 'journal_ref' string
  636. }
  637. // -- pages (take 1):
  638. // NOTE: For the first take, we assume the pages to be either preceded with a "p/pp" prefix, or to
  639. // be a page range.
  640. if (preg_match("/(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*)/i", $journalRefData)) // matches e.g. "p. 167-188", "pp.361--364" or "197-209"
  641. {
  642. $fieldParametersArray['startPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*)).*?$/i", "\\1\\3", $journalRefData); // extract starting page
  643. $fieldParametersArray['endPage'] = preg_replace("/^.*?(?:(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*)).*?$/i", "\\2\\4", $journalRefData); // extract ending page
  644. $journalRefData = preg_replace("/[,; ]*(?:(?:p(?:p)?\.? *)(\w*\d+\w*)(?: *-+ *(\w*\d+\w*))?|(?:p(?:p)?\.? *)?(\w*\d+\w*) *-+ *(\w*\d+\w*))[,; ]*/i", "", $journalRefData); // remove page info from 'journal_ref' string
  645. }
  646. // -- year (take 2):
  647. // NOTE: For the second take, we assume the year to be the first occurrence of any four-digit number
  648. // in the remaining 'journal_ref' string.
  649. if (!isset($fieldParametersArray['year']) AND preg_match("/\b\d{4}\b/i", $journalRefData))
  650. {
  651. $fieldParametersArray['year'] = preg_replace("/^.*?\b(\d{4})\b.*?$/i", "\\1", $journalRefData); // extract year
  652. $journalRefData = preg_replace("/[,; ]*\b\d{4}\b[,; ]*/i", " ", $journalRefData); // remove year from 'journal_ref' string
  653. }
  654. // -- pages (take 2):
  655. // NOTE: For the second take, we assume the page info to be any number that is at the beginning of
  656. // the remaining 'journal_ref' string.
  657. if (!isset($fieldParametersArray['startPage']) AND preg_match("/^[,; ]*\w*\d+\w*/i", $journalRefData))
  658. {
  659. $fieldParametersArray['startPage'] = preg_replace("/^[,; ]*(\w*\d+\w*).*?$/i", "\\1", $journalRefData); // extract page info
  660. }
  661. }
  662. // Standardize field data contained in '$fieldParametersArray':
  663. foreach ($fieldParametersArray as $fieldKey => $fieldData)
  664. {
  665. // In case of a latin1-based database, attempt to convert UTF-8 data to refbase markup & latin1:
  666. // (we exclude the 'author' and 'address' fields here since they have already been dealt with above)
  667. if ((!preg_match("/^(author|address)$/", $fieldKey)) AND ($contentTypeCharset == "ISO-8859-1") AND (detectCharacterEncoding($fieldData) == "UTF-8"))
  668. $fieldData = convertToCharacterEncoding("ISO-8859-1", "TRANSLIT", $fieldData, "UTF-8");
  669. // Decode HTML special chars:
  670. if (($fieldKey != "url") AND preg_match('/&(amp|quot|#0?39|lt|gt);/', $fieldData))
  671. $fieldParametersArray[$fieldKey] = decodeHTMLspecialchars($fieldData); // function 'decodeHTMLspecialchars()' is defined in 'include.inc.php'
  672. elseif (($fieldKey == "url") AND preg_match('/&amp;/', $fieldData)) // in case of the 'url' field, we just decode any ampersand characters
  673. $fieldParametersArray[$fieldKey] = str_replace('&amp;', '&', $fieldData);
  674. }
  675. // Function 'standardizeFieldData()' e.g. performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference:
  676. $fieldParametersArray = standardizeFieldData($fieldParametersArray, "arXiv XML", $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray);
  677. // Append the array of extracted field data to the main data array which holds all records to import:
  678. $parsedRecordsArray[] = $fieldParametersArray;
  679. }
  680. }
  681. // -----------------------------------------
  682. // Build refbase import array:
  683. $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
  684. "1.0", // 'version' - the version of the given array structure
  685. "http://refbase.net/import/arxiv/", // 'creator' - the name of the script/importer (preferably given as unique URI)
  686. "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
  687. "refbase@extracts.de", // 'contact' - author's email/contact address
  688. array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
  689. $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
  690. return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  691. }
  692. // --------------------------------------------------------------------
  693. // RIS TO REFBASE
  694. // This function converts records from Reference Manager (RIS) format into the standard "refbase"
  695. // array format which can be then imported by the 'addRecords()' function in 'include.inc.php'.
  696. function risToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
  697. {
  698. global $contentTypeCharset; // defined in 'ini.inc.php'
  699. global $errors;
  700. global $showSource;
  701. // Define regular expression patterns that will facilitate parsing of RIS data:
  702. // (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
  703. // Pattern by which the input text will be split into individual records:
  704. $recordDelimiter = "\s*[\r\n]ER - *[\r\n]*\s*";
  705. // Pattern by which records will be split into individual fields:
  706. $fieldDelimiter = "[\r\n]+(?=\w\w - )";
  707. // Pattern by which fields will be split into their field label (tag) and field data:
  708. $dataDelimiter = "(?<=^\w\w) - ";
  709. // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
  710. // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
  711. // - the split pattern must be specified as perl-style regular expression (including the leading & trailing
  712. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  713. $personDelimiter = "/ *; */";
  714. // Pattern by which a person's family name is separated from the given name (or initials):
  715. // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
  716. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  717. $familyNameGivenNameDelimiter = "/ *, */";
  718. // Specifies whether the person's family name comes first within a person's name
  719. // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
  720. $familyNameFirst = true;
  721. // Specifies whether a person's full given name(s) shall be shortened to initial(s):
  722. // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
  723. // - if set to 'false', given names (and any initials) are taken as is
  724. // - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
  725. $shortenGivenNames = true;
  726. // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
  727. $transformCase = true;
  728. // Preprocessor actions:
  729. // Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
  730. // (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
  731. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  732. // "/Search Pattern/" => "Replace Pattern"
  733. $preprocessorActionsArray = array(
  734. array(
  735. 'match' => "/&#?\w+;/", // if HTML encoded text (such as "&auml;", "&#xF6;" or "&#233;") occurs in the source data
  736. 'actions' => array(
  737. "/(&#?\w+;)/e" => "html_entity_decode('\\1', ENT_QUOTES, '$contentTypeCharset')" // HTML decode source data (see <http://www.php.net/manual/en/function.html-entity-decode.php>)
  738. )
  739. )
  740. );
  741. // Postprocessor actions:
  742. // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
  743. // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
  744. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  745. // "/Search Pattern/" => "Replace Pattern"
  746. $postprocessorActionsArray = array(
  747. array(
  748. 'fields' => array("year"),
  749. 'actions' => array(
  750. "/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
  751. )
  752. ),
  753. array(
  754. 'fields' => array("title"),
  755. 'actions' => array(
  756. "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
  757. )
  758. ),
  759. array(
  760. 'fields' => array("notes"),
  761. 'actions' => array(
  762. "/(; ?)?exported from refbase \(http[^ ]+ last updated.+?\d{2}:\d{2}:\d{2} [+-]\d{4}/" => "" // remove refbase attribution string (such as "exported from refbase (http://localhost/refs/show.php?record=12345), last updated on Sat, 15 Jul 2006 22:24:16 +0200")
  763. )
  764. ),
  765. array(
  766. 'fields' => array("url"),
  767. 'actions' => array(
  768. "/^PM:(\d+)$/i" => "http://www.ncbi.nlm.nih.gov/pubmed/\\1" // convert "PM:17302433" into a resolvable PubMed URL; Bibutils 'xml2ris' (<= v3.40) converts "<identifier type="pubmed">17302433</identifier>" to "UR - PM:17302433"
  769. )
  770. ),
  771. array(
  772. 'fields' => array("doi"),
  773. 'actions' => array(
  774. "#^(?:.*?)(?=10\.\d{4}/\S+?)|^(?!10\.\d{4}/\S+?).+$#" => "", // remove any text before the DOI (this also deletes the field's contents if it doesn't contain a DOI at all)
  775. "#(10\.\d{4}/\S+?)(?=$|\s).*$#" => "\\1" // remove any text after the DOI
  776. )
  777. ),
  778. array(
  779. 'fields' => array("title", "address", "keywords", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert font attributes (which some publishers include in RIS records that are available on their web pages)
  780. 'actions' => array(
  781. "/<sup>(.+?)<\/sup>/i" => "[super:\\1]", // replace '<sup>...</sup>' with refbase markup ('[super:...]')
  782. "/<sub>(.+?)<\/sub>/i" => "[sub:\\1]", // replace '<sub>...</sub>' with refbase markup ('[sub:...]')
  783. "/<i>(.+?)<\/i>/i" => "_\\1_", // replace '<i>...</i>' with refbase markup ('_..._')
  784. "/<b>(.+?)<\/b>/i" => "**\\1**", // replace '<b>...</b>' with refbase markup ('**...**')
  785. "/\\x10(.+?)\\x11/" => "_\\1_" // replace '<ASCII#10>...<ASCII#11>' (which is used by Reference Manager to indicate italic strings) with refbase markup ('_..._')
  786. )
  787. ),
  788. array(
  789. 'fields' => array("title", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert RefWorks font attributes (which RefWorks supports in title fields, notes, abstracts and user 1 - 5 fields)
  790. 'actions' => array(
  791. "/0RW1S34RfeSDcfkexd09rT3(.+?)1RW1S34RfeSDcfkexd09rT3/" => "[super:\\1]", // replace RefWorks indicators for superscript text with refbase markup ('[super:...]')
  792. "/0RW1S34RfeSDcfkexd09rT4(.+?)1RW1S34RfeSDcfkexd09rT4/" => "[sub:\\1]", // replace RefWorks indicators for subscript text with refbase markup ('[sub:...]')
  793. "/0RW1S34RfeSDcfkexd09rT2(.+?)1RW1S34RfeSDcfkexd09rT2/" => "_\\1_", // replace RefWorks indicators for italic text with refbase markup ('_..._')
  794. "/0RW1S34RfeSDcfkexd09rT0(.+?)1RW1S34RfeSDcfkexd09rT0/" => "**\\1**", // replace RefWorks indicators for bold text with refbase markup ('**...**')
  795. "/0RW1S34RfeSDcfkexd09rT1(.+?)1RW1S34RfeSDcfkexd09rT1/" => "__\\1__" // replace RefWorks indicators for underline text with refbase markup ('__...__')
  796. )
  797. )
  798. );
  799. // This array lists patterns which match all RIS tags that must occur within a record to be recognized as valid RIS record:
  800. // (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
  801. // the search patterns MUST include the leading & trailing slashes.)
  802. // "tag display name" => "tag search pattern"
  803. $requiredTagsArray = array(
  804. "TY" => "/^TY - /m"
  805. );
  806. // This array matches RIS tags with their corresponding refbase fields:
  807. // (fields that are unsupported in either RIS or refbase are commented out)
  808. // "RIS tag" => "refbase field" // RIS tag name (comment)
  809. $tagsToRefbaseFieldsArray = array(
  810. "TY" => "type", // Type of reference (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
  811. "AU" => "author", // Author Primary
  812. "A1" => "author", // Author Primary
  813. "A2" => array("CONF" => "conference", "CPAPER" => "conference", "Other" => "editor"), // Author Secondary (see note for 'corporate_author' below)
  814. "ED" => "editor", // Author Secondary
  815. "A3" => "series_editor", // Author Series
  816. // "A4" => "", // Subsidiary Authors
  817. "AD" => "address", // Address
  818. // "" => "corporate_author", // note that bibutils uses the RIS 'A2' tag to indicate conference titles ('<name type="conference">') and corporate authors ('<name type="corporate">', e.g., when importing contents of the BibTeX 'organization' field)
  819. // "AN" => "", // Accession Number
  820. // "CA" => "", // Caption
  821. // "LB" => "", // Label
  822. "TI" => "title", // Title Primary
  823. "T1" => "title", // Title Primary
  824. "CT" => "title", // Title Primary
  825. // "" => "orig_title",
  826. "PY" => "year", // RIS Spec 2009: Publication Year (four digits); RIS Spec 2004: Date Primary (date must be in the following format: "YYYY/MM/DD/other_info"; the year, month and day fields are all numeric; the other info field can be any string of letters, spaces and hyphens; note that each specific date information is optional, however the slashes ("/") are not)
  827. "Y1" => "year", // Date Primary (same syntax rules as for "PY", RIS Spec 2004)
  828. // "Y2" => "", // Date Secondary (same syntax rules as for "PY", RIS Spec 2004)
  829. "DA" => "year", // RIS Spec 2009: Date (same syntax rules as for "PY", RIS Spec 2004)
  830. "BT" => array("BOOK" => "series_title", "STD" => "series_title", "THES" => "series_title", "Other" => "publication"), // according to <http://www.refman.com/support/risformat_tags_01.asp> this would be: array("BOOK" => "title", "Other" => "publication"), // Book Whole: Title Primary; Other reference types: Title Secondary
  831. "JF" => "publication", // Periodical name: full format
  832. "JO" => "publication", // Periodical name: full format
  833. "JA" => "abbrev_journal", // Periodical name: standard abbreviation
  834. "J1" => "abbrev_journal", // Periodical name: user abbreviation 1
  835. "J2" => "abbrev_journal", // Periodical name: user abbreviation 2
  836. "T2" => array("JOUR" => "abbrev_journal", "Other" => "abbrev_series_title"), // Title Secondary (note that "T2" is used by bibutils (instead of "JA") for abbreviated journal names and abbreviated series titles)
  837. "T3" => "series_title", // Title Series (in case of "TY=CONF", "T3" appears to be used for conference title)
  838. "VL" => "volume", // Volume number
  839. "CP" => "issue", // Issue
  840. "IS" => "issue", // Issue
  841. "SP" => "startPage", // Start page number (contents of the special fields 'startPage' and 'endPage' will be merged into a range and copied to the refbase 'pages' field)
  842. "EP" => "endPage", // Ending page number
  843. "LP" => "endPage", // Ending page number ('LP' is actually not part of the RIS specification but gets used in the wild such as in RIS exports of the American Physical Society, <http://aps.org/>)
  844. // "" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
  845. // "" => "series_issue",
  846. "PB" => "publisher", // Publisher
  847. "CY" => "place", // City of publication
  848. // "DB" => "", // Database
  849. // "DP" => "", // Database provider
  850. "ET" => "edition",
  851. // "" => "medium",
  852. "SN" => array("BOOK" => "isbn", "CHAP" => "isbn", "STD" => "isbn", "THES" => "isbn", "Other" => "issn"), // Book Whole, Book Chapter, Generic and Thesis: ISBN; Other reference types: ISSN (note that this will fail for a thesis that was published within a series with an ISSN number)
  853. "LA" => "language",
  854. // "" => "summary_language",
  855. "KW" => "keywords", // Keywords
  856. "AB" => "abstract", // Notes (NOTE: by mapping 'AB' to "abstract" we deviate from the RIS spec which treats 'AB' as synonym to 'N1', i.e. notes)
  857. "N2" => "abstract", // Abstract
  858. // "" => "area",
  859. // "" => "expedition",
  860. // "" => "conference",
  861. "DO" => "doi", // DOI (we also recognize a DOI when given in the M3 or UR fields; see below)
  862. "UR" => "url", // URL (URL addresses can be entered individually, one per tag or multiple addresses can be entered on one line using a semi-colon as a separator)
  863. "L1" => "file", // Link to PDF (same syntax rules as for "UR")
  864. // "L2" => "", // Link to Full-text (same syntax rules as for "UR")
  865. // "L3" => "related", // Related Records (this mapping would require some postprocessing of the field value so that it's suitable for the 'related' field)
  866. // "L4" => "", // Image(s)
  867. "N1" => "notes", // Notes
  868. "ID" => "call_number", // Reference ID (NOTE: if no other field gets mapped to the 'cite_key' field, the contents of the 'call_number' field will be also copied to the 'cite_key' field of the currently logged-in user)
  869. // "M1" => "", // Miscellaneous 1
  870. // "M2" => "", // Miscellaneous 2
  871. "M3" => "doi", // Miscellaneous 3 (ISI Web of Science exports the DOI number in the M3 field) but TR specified it should be Type of Work.
  872. "U1" => "thesis", // User definable 1 ('U1' is used by Bibutils to indicate the type of thesis, e.g. "Masters thesis" or "Ph.D. thesis"; function 'parseRecords()' will further tweak the contents of the refbase 'thesis' field)
  873. "U2" => "user_notes", // User definable 2
  874. // "U3" => "", // User definable 3
  875. // "U4" => "", // User definable 4
  876. // "U5" => "", // User definable 5
  877. // "" => "contribution_id",
  878. // "" => "online_publication",
  879. // "" => "online_citation",
  880. // "" => "approved",
  881. // "" => "orig_record",
  882. // "RP" => "copy", // Reprint status (valid values: "IN FILE", "NOT IN FILE", "ON REQUEST (MM/DD/YY)") (this mapping would require some postprocessing of the field value so that it's suitable for the 'copy' field)
  883. // "AV" => "", // Availability
  884. // "C1" => "", // Custom 1
  885. // "C2" => "", // Custom 2
  886. // "C3" => "", // Custom 3
  887. // "C4" => "", // Custom 4
  888. // "C5" => "", // Custom 5
  889. // "C6" => "", // Custom 6
  890. // "C7" => "", // Custom 7
  891. // "C8" => "", // Custom 8
  892. );
  893. // This array lists all RIS tags that may occur multiple times:
  894. $tagsMultipleArray = array(
  895. "AU",
  896. "A1",
  897. "A2",
  898. "ED",
  899. "A3",
  900. "KW",
  901. "UR", // currently, refbase does only support one URL per record (however, we allow 'UR' to occur multiple times to extract any DOI given as URL, otherwise only the first URL will be taken)
  902. // "L1", // currently, refbase does only support one file per record
  903. "N1"
  904. );
  905. // This array matches RIS reference types with their corresponding refbase types:
  906. // (RIS types that are currently not supported in refbase will be taken as is but will get
  907. // prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
  908. // is not a perfect match but as close as currently possible)
  909. // "RIS type" => "refbase type" // name of RIS reference type (comment)
  910. $referenceTypesToRefbaseTypesArray = array(
  911. "ABST" => "Abstract", // Abstract
  912. "ADVS" => "Unsupported: Audiovisual Material", // Audiovisual material
  913. "AGGR" => "Unsupported: Aggregated Database", // Aggregated database
  914. "ANCIENT" => "Unsupported: Ancient Text", // Ancient text
  915. "ART" => "Unsupported: Art Work", // Art work
  916. "BILL" => "Unsupported: Bill/Resolution", // Bill/Resolution
  917. "BLOG" => "Unsupported: Blog", // Blog
  918. "BOOK" => "Book Whole", // Book, Whole
  919. "CASE" => "Unsupported: Case", // Case
  920. "CHAP(TER)?" => "Book Chapter", // Book chapter (the incorrect CHAPTER type gets used by SpringerLink, see e.g. RIS output at <http://www.springerlink.com/content/57w5dd51eh0h8a25>)
  921. "CHART" => "Unsupported: Chart", // Chart
  922. "CLSWK" => "Unsupported: Classical Work", // Classical work
  923. "COMP" => "Software", // Computer program
  924. "CONF" => "Conference Article", // Conference proceeding
  925. "CPAPER" => "Conference Article", // Conference paper
  926. "CTLG" => "Book Whole", // Catalog (#fallback#)
  927. "DATA" => "Unsupported: Data File", // Data file
  928. "DBASE" => "Unsupported: Online Database", // Online database
  929. "DICT" => "Book Whole", // Dictionary (#fallback#)
  930. "EBOOK" => "Book Whole", // Electronic book
  931. "ECHAP" => "Book Chapter", // Electronic book section
  932. "EDBOOK" => "Book Whole", // Edited book
  933. "EJOUR" => "Journal Article", // Electronic article
  934. "ELEC" => "Unsupported: Electronic Citation", // Electronic Citation
  935. "ENCYC" => "Book Whole", // Encyclopedia (#fallback#)
  936. "EQUA" => "Unsupported: Equation", // Equation
  937. "FIGURE" => "Unsupported: Figure", // Firure
  938. "GEN" => "Miscellaneous", // Generic
  939. "GOVDOC" => "Unsupported: Government Document", // Government document
  940. "GRNT" => "Unsupported: Grant", // Grant
  941. "HEAR" => "Unsupported: Hearing", // Hearing
  942. "ICOMM" => "Unsupported: Internet Communication", // Internet Communication
  943. "INPR" => "Journal Article", // In Press (#fallback#)
  944. "JFULL" => "Journal", // Journal (full)
  945. "JOUR" => "Journal Article", // Journal
  946. "LEGAL" => "Unsupported: Legal Rule", // Legal rule
  947. "MAP" => "Map", // Map
  948. "MANSCPT" => "Manuscript", // Manuscript
  949. "MGZN" => "Magazine Article", // Magazine article
  950. "MPCT" => "Unsupported: Motion Picture", // Motion picture
  951. "MULTI" => "Unsupported: Multimedia", // Multimedia
  952. "MUSIC" => "Unsupported: Music Score", // Music score
  953. "NEWS" => "Newspaper Article", // Newspaper
  954. "PAMP" => "Unsupported: Pamphlet", // Pamphlet
  955. "PAT" => "Patent", // Patent
  956. "PCOMM" => "Unsupported: Personal Communication", // Personal communication
  957. "RPRT" => "Report", // Report
  958. "SER" => "Unsupported: Serial (Book, Monograph)", // Serial (Book, Monograph)
  959. "SLIDE" => "Unsupported: Slide", // Slide
  960. "SOUND" => "Unsupported: Sound Recording", // Sound recording
  961. "STAND" => "Miscellaneous", // Standard (#fallback#) due to STD handling
  962. "STAT" => "Unsupported: Statute", // Statute
  963. "STD" => "Miscellaneous", // Generic (note that 'STD' is used by bibutils although it is NOT listed as a recognized reference type at <http://www.refman.com/support/risformat_reftypes.asp>)
  964. "THES" => "Thesis", // Thesis/Dissertation (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
  965. "UNBILL" => "Unsupported: Unenacted Bill/Resolution", // Unenacted bill/resolution
  966. "UNPB" => "Manuscript", // Unpublished work (#fallback#)
  967. "VIDEO" => "Unsupported: Video Recording" // Video recording
  968. );
  969. // -----------------------------------------
  970. // Split input text into individual records:
  971. $recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the "ER" (= end of record) tag that terminates every RIS record
  972. // Validate all records that shall be imported:
  973. list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
  974. // Parse all records that shall be imported:
  975. list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "RIS", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
  976. // Build refbase import array:
  977. $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
  978. "1.0", // 'version' - the version of the given array structure
  979. "http://refbase.net/import/ris/", // 'creator' - the name of the script/importer (preferably given as unique URI)
  980. "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
  981. "refbase@extracts.de", // 'contact' - author's email/contact address
  982. array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
  983. $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
  984. return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  985. }
  986. // --------------------------------------------------------------------
  987. // ENDNOTE TO REFBASE
  988. // This function converts records from Endnote tagged (Endnote Refer) format into the standard "refbase"
  989. // array format which can then be imported by the 'addRecords()' function in 'include.inc.php'.
  990. function endnoteToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
  991. {
  992. global $contentTypeCharset; // defined in 'ini.inc.php'
  993. global $errors;
  994. global $showSource;
  995. // Define regular expression patterns that will facilitate parsing of Endnote Refer data:
  996. // (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
  997. // Pattern by which the input text will be split into individual records:
  998. $recordDelimiter = "\s*(\r\n|\r(?!\n)|(?<!\r)\n){2,}\s*(?=%\S )";
  999. // Pattern by which records will be split into individual fields:
  1000. $fieldDelimiter = "(\r\n|\r(?!\n)|(?<!\r)\n)+(?=%\S )";
  1001. // Pattern by which fields will be split into their field label (tag) and field data:
  1002. $dataDelimiter = "(?<=^%\S) ";
  1003. // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
  1004. // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
  1005. // - the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1006. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1007. $personDelimiter = "/ *; */";
  1008. // Pattern by which a person's family name is separated from the given name (or initials):
  1009. // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1010. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1011. $familyNameGivenNameDelimiter = "/ *, */";
  1012. // Specifies whether the person's family name comes first within a person's name
  1013. // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
  1014. $familyNameFirst = true;
  1015. // Specifies whether a person's full given name(s) shall be shortened to initial(s):
  1016. // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
  1017. // - if set to 'false', given names (and any initials) are taken as is
  1018. // - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
  1019. $shortenGivenNames = true;
  1020. // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
  1021. $transformCase = true;
  1022. // Preprocessor actions:
  1023. // Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
  1024. // (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
  1025. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1026. // "/Search Pattern/" => "Replace Pattern"
  1027. $preprocessorActionsArray = array(
  1028. array(
  1029. 'match' => "/&#?\w+;/", // if HTML encoded text (such as "&auml;", "&#xF6;" or "&#233;") occurs in the source data
  1030. 'actions' => array(
  1031. "/(&#?\w+;)/e" => "html_entity_decode('\\1', ENT_QUOTES, '$contentTypeCharset')" // HTML decode source data (see <http://www.php.net/manual/en/function.html-entity-decode.php>)
  1032. )
  1033. )
  1034. );
  1035. // Postprocessor actions:
  1036. // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
  1037. // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
  1038. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1039. // "/Search Pattern/" => "Replace Pattern"
  1040. $postprocessorActionsArray = array(
  1041. array(
  1042. 'fields' => array("year"),
  1043. 'actions' => array(
  1044. "/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
  1045. )
  1046. ),
  1047. array(
  1048. 'fields' => array("title"),
  1049. 'actions' => array(
  1050. "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
  1051. )
  1052. ),
  1053. array(
  1054. 'fields' => array("notes"),
  1055. 'actions' => array(
  1056. "/(; ?)?exported from refbase \(http[^ ]+ last updated.+?\d{2}:\d{2}:\d{2} [+-]\d{4}/" => "" // remove refbase attribution string (such as "exported from refbase (http://localhost/refs/show.php?record=12345), last updated on Sat, 15 Jul 2006 22:24:16 +0200")
  1057. )
  1058. ),
  1059. array(
  1060. 'fields' => array("url"),
  1061. 'actions' => array(
  1062. "/^PM:(\d+)$/i" => "http://www.ncbi.nlm.nih.gov/pubmed/\\1" // convert "PM:17302433" into a resolvable PubMed URL; Bibutils 'xml2ris' (<= v3.40) converts "<identifier type="pubmed">17302433</identifier>" to "UR - PM:17302433"
  1063. )
  1064. ),
  1065. array(
  1066. 'fields' => array("doi"),
  1067. 'actions' => array(
  1068. "#^(?:.*?)(?=10\.\d{4}/\S+?)|^(?!10\.\d{4}/\S+?).+$#" => "", // remove any text before the DOI (this also deletes the field's contents if it doesn't contain a DOI at all)
  1069. "#(10\.\d{4}/\S+?)(?=$|\s).*$#" => "\\1" // remove any text after the DOI
  1070. )
  1071. ),
  1072. array(
  1073. 'fields' => array("title", "address", "keywords", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert font attributes (which publishers might have included in Endnote Refer records that are available on their web pages)
  1074. 'actions' => array(
  1075. "/<sup>(.+?)<\/sup>/i" => "[super:\\1]", // replace '<sup>...</sup>' with refbase markup ('[super:...]')
  1076. "/<sub>(.+?)<\/sub>/i" => "[sub:\\1]", // replace '<sub>...</sub>' with refbase markup ('[sub:...]')
  1077. "/<i>(.+?)<\/i>/i" => "_\\1_", // replace '<i>...</i>' with refbase markup ('_..._')
  1078. "/<b>(.+?)<\/b>/i" => "**\\1**", // replace '<b>...</b>' with refbase markup ('**...**')
  1079. "/\\x10(.+?)\\x11/" => "_\\1_" // replace '<ASCII#10>...<ASCII#11>' (which is used by Reference Manager to indicate italic strings) with refbase markup ('_..._')
  1080. )
  1081. ),
  1082. array(
  1083. 'fields' => array("title", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert RefWorks font attributes (which RefWorks supports in title fields, notes, abstracts and user 1 - 5 fields)
  1084. 'actions' => array(
  1085. "/0RW1S34RfeSDcfkexd09rT3(.+?)1RW1S34RfeSDcfkexd09rT3/" => "[super:\\1]", // replace RefWorks indicators for superscript text with refbase markup ('[super:...]')
  1086. "/0RW1S34RfeSDcfkexd09rT4(.+?)1RW1S34RfeSDcfkexd09rT4/" => "[sub:\\1]", // replace RefWorks indicators for subscript text with refbase markup ('[sub:...]')
  1087. "/0RW1S34RfeSDcfkexd09rT2(.+?)1RW1S34RfeSDcfkexd09rT2/" => "_\\1_", // replace RefWorks indicators for italic text with refbase markup ('_..._')
  1088. "/0RW1S34RfeSDcfkexd09rT0(.+?)1RW1S34RfeSDcfkexd09rT0/" => "**\\1**", // replace RefWorks indicators for bold text with refbase markup ('**...**')
  1089. "/0RW1S34RfeSDcfkexd09rT1(.+?)1RW1S34RfeSDcfkexd09rT1/" => "__\\1__" // replace RefWorks indicators for underline text with refbase markup ('__...__')
  1090. )
  1091. )
  1092. );
  1093. // This array lists patterns which match all Endnote Refer tags that must occur within a record to be recognized as valid Endnote Refer record:
  1094. // (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
  1095. // the search patterns MUST include the leading & trailing slashes.)
  1096. // "tag display name" => "tag search pattern"
  1097. $requiredTagsArray = array(
  1098. "%0" => "/^%0 /m"
  1099. );
  1100. // This array matches Endnote Refer tags with their corresponding refbase fields:
  1101. // (fields that are unsupported in either Endnote Refer or refbase are commented out)
  1102. // "Endnote Refer tag" => "refbase field" // Endnote Refer tag name (comment)
  1103. $tagsToRefbaseFieldsArray = array(
  1104. "%0" => "type", // Reference Type (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
  1105. "%A" => "author", // Author (Primary Author)
  1106. "%E" => "editor", // Editor (Secondary Author)
  1107. "%Y" => "series_editor", // Tertiary Author (Series Author)
  1108. "%+" => "address", // Address
  1109. // "" => "corporate_author", // Corporate Author (in the original Refer format '%Q' was referring to the corporate author)
  1110. "%T" => "title", // Title (Primary Title)
  1111. // "" => "orig_title",
  1112. "%D" => "year", // Year (Primary Date; the year should be specified in full, and the month name rather than number should be used)
  1113. "%B" => array("Journal Article" => "publication", "Book Section" => "publication", "Conference Proceedings" => "publication", "Magazine Article" => "publication", "Newspaper Article" => "publication", "Other" => "series_title"), // Secondary Title (of a Book or Conference Name); refbase "in-container" types: Secondary Title; Other reference types: Tertiary Title (Series Title)
  1114. "%J" => "publication", // Secondary Title (Journal Name / Periodical Name)
  1115. // "" => "abbrev_journal", // Periodical Name: Standard Abbreviation (Endnote Refer doesn't seem to distinguish between full format & abbreviated formats)
  1116. "%S" => "series_title", // Tertiary Title (Series Title)
  1117. "%V" => "volume", // Volume
  1118. "%N" => "issue", // Number (Issue)
  1119. "%P" => "pages", // Pages
  1120. // "" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
  1121. // "" => "series_issue",
  1122. "%I" => "publisher", // Publisher
  1123. "%C" => "place", // Place Published
  1124. "%7" => "edition", // Edition
  1125. // "" => "medium",
  1126. "%@" => array("Book" => "isbn", "Book Section" => "isbn", "Edited Book" => "isbn", "Classical Work" => "isbn", "Generic" => "isbn", "Thesis" => "isbn", "Other" => "issn"), // ISBN/ISSN; Book Whole, Book Chapter, Generic and Thesis: ISBN; Other reference types: ISSN (note that this will fail for a thesis that was published within a series with an ISSN number)
  1127. "%G" => "language", // Language
  1128. // "" => "summary_language",
  1129. "%K" => "keywords", // Keywords
  1130. "%X" => "abstract", // Abstract
  1131. // "" => "area",
  1132. // "" => "expedition",
  1133. // "" => "conference", // see '%8' below
  1134. // "" => "doi", // DOI
  1135. "%U" => "url", // URL
  1136. "%>" => "file", // Link to PDF
  1137. // "" => "related", // Related Records (this mapping would require some postprocessing of the field value so that it's suitable for the 'related' field)
  1138. "%O" => "notes", // Notes (Bibutils uses '%O' for notes instead!)
  1139. "%Z" => "notes", // Notes
  1140. "%F" => "cite_key", // Label (Reference ID)
  1141. "%9" => "thesis", // Type of Work (how the entry was published; for reports, this would be the report type, and for theses, the thesis type (e.g. "Masters thesis" or "Ph.D. thesis"); function 'parseRecords()' will further tweak the contents of the refbase 'thesis' field; the original Refer format seems to use '%R' for report, paper, or thesis type)
  1142. // "%H" => "", // Translated Author (in the original Refer format '%H' was referring to the "Header commentary which is printed before the reference")
  1143. "%L" => "call_number", // Call Number (NOTE: if no other field gets mapped to the 'cite_key' field, the contents of the 'call_number' field will be also copied to the 'cite_key' field of the currently logged-in user)
  1144. "%8" => array("Conference Proceedings" => "conference", "Other" => "notes"), // Date (date associated with entry; for conference proceedings, this would be the date of the conference)
  1145. // "" => "contribution_id",
  1146. // "" => "online_publication",
  1147. // "" => "online_citation",
  1148. // "" => "approved",
  1149. // "" => "orig_record",
  1150. // "" => "copy", // Reprint status (this mapping would require some postprocessing of the field value so that it's suitable for the 'copy' field)
  1151. // "%M" => "", // Accession Number
  1152. // "%Q" => "", // Translated Title (in the original Refer format '%Q' was referring to the corporate author)
  1153. // "%R" => "", // Electronic Resource Number
  1154. // "%W" => "", // Database Provider
  1155. // "%1" => "", // Custom 1
  1156. // "%2" => "", // Custom 2
  1157. // "%3" => "", // Custom 3
  1158. // "%4" => "", // Custom 4
  1159. // "%6" => "", // Number of Volumes
  1160. // "%?" => "", // Subsidiary Author
  1161. // "%!" => "", // Short Title
  1162. // "%#" => "", // Custom 5
  1163. // "%$" => "", // Custom 6
  1164. // "%]" => "", // Custom 7
  1165. // "%&" => "", // Section
  1166. // "%(" => "", // Original Publication (date)
  1167. // "%)" => "", // Reprint Edition (date)
  1168. // "%*" => "", // Reviewed Item
  1169. // "%^" => "", // Caption
  1170. // "%<" => "", // Research Notes
  1171. // "%[" => "", // Access Date
  1172. // "%=" => "", // Last Modified Date
  1173. // "%~" => "", // Name of Database
  1174. );
  1175. // This array lists all Endnote Refer tags that may occur multiple times:
  1176. $tagsMultipleArray = array(
  1177. "%A",
  1178. "%E",
  1179. "%I",
  1180. "%K",
  1181. "%O",
  1182. "%Y",
  1183. "%Z",
  1184. "%@"
  1185. );
  1186. // This array matches Endnote Refer reference types with their corresponding refbase types:
  1187. // (Endnote Refer types that are currently not supported in refbase will be taken as is but will get
  1188. // prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
  1189. // is not a perfect match but as close as currently possible)
  1190. // "Endnote Refer type" => "refbase type" // comment
  1191. $referenceTypesToRefbaseTypesArray = array(
  1192. // "" => "Abstract",
  1193. "Aggregated Database" => "Unsupported: Aggregated Database", // EN X2
  1194. "Ancient Text" => "Unsupported: Ancient Text", // EN X
  1195. "Artwork" => "Unsupported: Artwork",
  1196. "Audiovisual Material" => "Unsupported: Audiovisual Material",
  1197. "Bill" => "Unsupported: Bill",
  1198. "Blog" => "Unsupported: Blog", // EN X2
  1199. "^Book$" => "Book Whole", // without the Regex anchors "Book Section" would get renamed incorrectly as "Book Whole Section"
  1200. "Book Section" => "Book Chapter",
  1201. "Case" => "Unsupported: Case",
  1202. "Catalog" => "Unsupported: Catalog", // EN X2
  1203. "Chart or Table" => "Unsupported: Chart or Table",
  1204. "Classical Work" => "Book Whole", // #fallback# // EN 8 (Classical Works)
  1205. "Computer Program" => "Software",
  1206. "Conference Paper" => "Conference Article", // EN 8
  1207. "Conference Proceedings" => "Conference Volume",
  1208. "Dictionary" => "Unsupported: Dictionary", // EN X
  1209. "Edited Book" => "Book Whole",
  1210. "Electronic Article" => "Journal Article", // #fallback# // EN 8 (Electronic Journal); renamed in EN 9 (was: Electronic Journal)
  1211. "Electronic Book" => "Book Whole", // #fallback# // EN 8
  1212. "Encyclopedia" => "Unsupported: Encyclopedia", // EN X
  1213. "Equation" => "Unsupported: Equation",
  1214. "Figure" => "Unsupported: Figure",
  1215. "Film or Broadcast" => "Unsupported: Film or Broadcast",
  1216. "Generic" => "Miscellaneous",
  1217. "Government Document" => "Report", // #fallback# // EN 8 (Government Report or Document)
  1218. "Grant" => "Unsupported: Grant", // EN X
  1219. "Hearing" => "Unsupported: Hearing",
  1220. "Journal Article" => "Journal Article",
  1221. "Legal Rule or Regulation" => "Unsupported: Legal Rule or Regulation", // EN 8 (Legal Rule/Regulation)
  1222. "Magazine Article" => "Magazine Article",
  1223. "Manuscript" => "Manuscript",
  1224. "Map" => "Map",
  1225. "Newspaper Article" => "Newspaper Article",
  1226. "Online Database" => "Unsupported: Online Database", // EN 8
  1227. "Online Multimedia" => "Unsupported: Online Multimedia", // EN 8
  1228. "Pamphlet" => "Unsupported: Pamphlet", // EN X2
  1229. "Patent" => "Patent",
  1230. "Personal Communication" => "Unsupported: Personal Communication",
  1231. "Report" => "Report",
  1232. "Serial" => "Unsupported: Serial", // EN X2
  1233. "Standard" => "Unsupported: Standard", // EN X2
  1234. "Statute" => "Unsupported: Statute",
  1235. "Thesis" => "Thesis", // function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field
  1236. "Unpublished Work" => "Manuscript", // #fallback# // EN 8
  1237. "Web Page" => "Unsupported: Web Page", // renamed in EN X (was: Electronic Source)
  1238. "Unused 1" => "Unsupported: Unused 1",
  1239. "Unused 2" => "Unsupported: Unused 2",
  1240. "Unused 3" => "Unsupported: Unused 3"
  1241. );
  1242. // -----------------------------------------
  1243. // Split input text into individual records:
  1244. $recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the blank line that delimites Endnote Refer records
  1245. // Validate all records that shall be imported:
  1246. list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
  1247. // Parse all records that shall be imported:
  1248. list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "Endnote", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
  1249. // Build refbase import array:
  1250. $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
  1251. "1.0", // 'version' - the version of the given array structure
  1252. "http://refbase.net/import/endnote-refer/", // 'creator' - the name of the script/importer (preferably given as unique URI)
  1253. "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
  1254. "refbase@extracts.de", // 'contact' - author's email/contact address
  1255. array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
  1256. $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
  1257. return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  1258. }
  1259. // --------------------------------------------------------------------
  1260. // MEDLINE TO REFBASE
  1261. // This function converts records from Pubmed MEDLINE format into the standard "refbase"
  1262. // array format which can then be imported by the 'addRecords()' function in 'include.inc.php'.
  1263. function medlineToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
  1264. {
  1265. global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
  1266. global $errors;
  1267. global $showSource;
  1268. // Define regular expression patterns that will facilitate parsing of MEDLINE data:
  1269. // (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
  1270. // Pattern by which the input text will be split into individual records:
  1271. $recordDelimiter = "\s*[\r\n](?=PMID- |<html>)"; // PubMed error messages are wrapped into HTML (errors may occur e.g. when fetching MEDLINE data directly via their PubMed ID)
  1272. // Pattern by which records will be split into individual fields:
  1273. $fieldDelimiter = "[\r\n]+(?=\w{2,4} *- )";
  1274. // Pattern by which fields will be split into their field label (tag) and field data:
  1275. $dataDelimiter = "(?<=^\w{2}) - |(?<=^\w{3}) - |(?<=^\w{4})- ";
  1276. // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
  1277. // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
  1278. // - the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1279. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1280. $personDelimiter = "/ *; */";
  1281. // Pattern by which a person's family name is separated from the given name (or initials):
  1282. // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1283. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1284. $familyNameGivenNameDelimiter = "/ *, */";
  1285. // Specifies whether the person's family name comes first within a person's name
  1286. // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
  1287. $familyNameFirst = true;
  1288. // Specifies whether a person's full given name(s) shall be shortened to initial(s):
  1289. // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
  1290. // - if set to 'false', given names (and any initials) are taken as is
  1291. // - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
  1292. $shortenGivenNames = true;
  1293. // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
  1294. $transformCase = true;
  1295. // Preprocessor actions:
  1296. // Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
  1297. // (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
  1298. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1299. // "/Search Pattern/" => "Replace Pattern"
  1300. $preprocessorActionsArray = array(
  1301. array(
  1302. 'match' => "/^FAU - .+?[\r\n]AU - /m", // if author info is available via both 'FAU' *AND* 'AU' field(s)
  1303. 'actions' => array(
  1304. "/^AU - .+?[\r\n]+/m" => "" // discard any 'AU' field(s) (which otherwise would confuse the 'parseRecords()' function)
  1305. )
  1306. ),
  1307. array(
  1308. 'match' => "/^AU - /m",
  1309. 'actions' => array(
  1310. "/(?<=^AU - )([$alpha -]+) +([$upper]+)/m$patternModifiers" => "\\1, \\2" // change the string formatting in 'AU' field(s) to the one used by refbase (i.e. insert a comma between family name & initials)
  1311. )
  1312. )
  1313. );
  1314. // Postprocessor actions:
  1315. // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
  1316. // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
  1317. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1318. // "/Search Pattern/" => "Replace Pattern"
  1319. $postprocessorActionsArray = array(
  1320. array(
  1321. 'fields' => array("year"),
  1322. 'actions' => array(
  1323. "/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
  1324. )
  1325. ),
  1326. array(
  1327. 'fields' => array("title", "orig_title", "publication", "address"),
  1328. 'actions' => array(
  1329. "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
  1330. )
  1331. ),
  1332. array(
  1333. 'fields' => array("publication", "abbrev_journal"), // NOTE: this replacement action will probably be only beneficial for records of type "Journal Article" (if possible, this should rather be a preprocessor action to distinguish articles from books or other resource types)
  1334. 'actions' => array(
  1335. "/\b([$lower])([$alpha]{3,})/e$patternModifiers" => "strtoupper('\\1').'\\2'" // make sure that all journal title words (with >3 characters) start with an upper case letter (the 'e' modifier allows to execute PHP code within the replacement pattern)
  1336. )
  1337. ),
  1338. array(
  1339. 'fields' => array("issn"),
  1340. 'actions' => array(
  1341. "/^.*?(\w{4}-?\w{4}).*/" => "\\1" // remove any text except the actual ISSN number
  1342. )
  1343. ),
  1344. array(
  1345. 'fields' => array("notes"),
  1346. 'actions' => array(
  1347. "/^(\d+)/" => "PMID:\\1", // insert a "PMID:" prefix in front of any number that's at the beginning of the notes field
  1348. "/Pmc(\d+)/" => "PMCID:PMC\\1" // insert a "PMCID:" prefix in front of any number that is prefixed with "Pmc"
  1349. )
  1350. ),
  1351. array(
  1352. 'fields' => array("doi"),
  1353. 'actions' => array(
  1354. "/^.*?([^ ]+) *\[doi\].*/" => "\\1", // if a DOI number is given, extract the DOI and discard everything else
  1355. "/^.*?\[[^]]+\].*/" => "" // if no DOI number was given but some other ID info is still present, remove everything from the field
  1356. )
  1357. ),
  1358. array(
  1359. 'fields' => array("language", "summary_language"),
  1360. 'actions' => array(
  1361. "/^afr$/i" => "Afrikaans", // map abbreviated language names to full names (taken from <http://www.nlm.nih.gov/bsd/language_table.html>)
  1362. "/^alb$/i" => "Albanian",
  1363. "/^amh$/i" => "Amharic",
  1364. "/^ara$/i" => "Arabic",
  1365. "/^arm$/i" => "Armenian",
  1366. "/^aze$/i" => "Azerbaijani",
  1367. "/^ben$/i" => "Bengali",
  1368. "/^bos$/i" => "Bosnian",
  1369. "/^bul$/i" => "Bulgarian",
  1370. "/^cat$/i" => "Catalan",
  1371. "/^chi$/i" => "Chinese",
  1372. "/^cze$/i" => "Czech",
  1373. "/^dan$/i" => "Danish",
  1374. "/^dut$/i" => "Dutch",
  1375. "/^eng$/i" => "English",
  1376. "/^epo$/i" => "Esperanto",
  1377. "/^est$/i" => "Estonian",
  1378. "/^fin$/i" => "Finnish",
  1379. "/^fre$/i" => "French",
  1380. "/^geo$/i" => "Georgian",
  1381. "/^ger$/i" => "German",
  1382. "/^gla$/i" => "Scottish Gaelic",
  1383. "/^gre$/i" => "Greek, Modern",
  1384. "/^heb$/i" => "Hebrew",
  1385. "/^hin$/i" => "Hindi",
  1386. "/^hun$/i" => "Hungarian",
  1387. "/^ice$/i" => "Icelandic",
  1388. "/^ind$/i" => "Indonesian",
  1389. "/^ita$/i" => "Italian",
  1390. "/^jpn$/i" => "Japanese",
  1391. "/^kin$/i" => "Kinyarwanda",
  1392. "/^kor$/i" => "Korean",
  1393. "/^lat$/i" => "Latin",
  1394. "/^lav$/i" => "Latvian",
  1395. "/^lit$/i" => "Lithuanian",
  1396. "/^mac$/i" => "Macedonian",
  1397. "/^mal$/i" => "Malayalam",
  1398. "/^mao$/i" => "Maori",
  1399. "/^may$/i" => "Malay",
  1400. "/^mul$/i" => "Multiple languages",
  1401. "/^nor$/i" => "Norwegian",
  1402. "/^per$/i" => "Persian",
  1403. "/^pol$/i" => "Polish",
  1404. "/^por$/i" => "Portuguese",
  1405. "/^pus$/i" => "Pushto",
  1406. "/^rum$/i" => "Romanian, Rumanian",
  1407. "/^rus$/i" => "Russian",
  1408. "/^san$/i" => "Sanskrit",
  1409. "/^scc$/i" => "Serbian",
  1410. "/^scr$/i" => "Croatian",
  1411. "/^slo$/i" => "Slovak",
  1412. "/^slv$/i" => "Slovenian",
  1413. "/^spa$/i" => "Spanish",
  1414. "/^swe$/i" => "Swedish",
  1415. "/^tha$/i" => "Thai",
  1416. "/^tur$/i" => "Turkish",
  1417. "/^ukr$/i" => "Ukrainian",
  1418. "/^und$/i" => "Undetermined",
  1419. "/^urd$/i" => "Urdu",
  1420. "/^vie$/i" => "Vietnamese",
  1421. "/^wel$/i" => "Welsh"
  1422. )
  1423. )
  1424. );
  1425. // This array lists patterns which match all MEDLINE tags that must occur within a record to be recognized as valid MEDLINE record:
  1426. // (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
  1427. // the search patterns MUST include the leading & trailing slashes.)
  1428. // "tag display name" => "tag search pattern"
  1429. $requiredTagsArray = array(
  1430. "PMID" => "/^PMID- /m",
  1431. "PT" => "/^PT - /m"
  1432. );
  1433. // This array matches MEDLINE tags with their corresponding refbase fields:
  1434. // (MEDLINE fields taken from <http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helppubmed.table.pubmedhelp.T43>;
  1435. // fields that are unsupported in either MEDLINE or refbase are commented out)
  1436. // "MEDLINE tag" => "refbase field" // MEDLINE tag name [Description] (comment)
  1437. $tagsToRefbaseFieldsArray = array(
  1438. "PT" => "type", // Publication Type [The type of material the article represents] (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
  1439. "AU" => "author", // Author [Authors] (the contents of the 'AU' field will be used if the 'FAU' field is not available; note that for records that contain both 'AU' *AND* 'FAU' fields, this only works if a suitable preprocessor action is defined, see above)
  1440. "FAU" => "author", // Full Author Name [Full Author Names] (by default, we use this author format since family name and initials are uniquely separated by a comma)
  1441. // "" => "editor",
  1442. // "" => "series_editor",
  1443. "AD" => "address", // Affiliation [Institutional affiliation and address of the first author]
  1444. "CN" => "corporate_author", // Corporate Author [Corporate author or group names with authorship responsibility]
  1445. "TI" => "title", // Title [The title of the article]
  1446. "TT" => "orig_title", // Transliterated Title [Title of the article originally published in a non-English language, in that language]
  1447. "DP" => "year", // Publication Date [The date the article was published]
  1448. // "DEP" => "", // Date of Electronic Publication [Electronic publication date]
  1449. "JT" => "publication", // Full Journal Title [Full journal title from NLM's cataloging data]
  1450. "TA" => "abbrev_journal", // Journal Title Abbreviation [Standard journal title abbreviation]
  1451. // "" => "series_title",
  1452. "VI" => "volume", // Volume [Volume number of the journal]
  1453. "IP" => "issue", // Issue [The number of the issue, part, or supplement of the journal in which the article was published]
  1454. "PG" => "pages", // Pagination [The full pagination of the article]
  1455. // "" => "series_volume",
  1456. // "" => "series_issue",
  1457. // "" => "edition",
  1458. // "" => "medium",
  1459. // "" => "isbn",
  1460. "IS" => "issn", // ISSN [International Standard Serial Number of the journal]
  1461. // "" => "publisher",
  1462. // "PL" => "place", // Place of Publication [Journal's country of publication] (the "PL" field lists the *country* of publication but the *city* of publication should go into the "place" field)
  1463. "LA" => "language", // Language [The language in which the article was published]
  1464. // "" => "summary_language",
  1465. "MH" => "keywords", // MeSH Terms [NLM's Medical Subject Headings (MeSH) controlled vocabulary]
  1466. "OT" => "keywords", // Other Term [Non-MeSH subject terms (keywords) assigned by an organization identified by the Other Term Owner]
  1467. "OAB" => "abstract", // Other Abstract [Abstract supplied by an NLM collaborating organization] (since "AB" is defined later and neither "OAB" nor "AB" is listed in '$tagsMultipleArray', any content in "AB" will overwrite contents of "OAB")
  1468. "AB" => "abstract", // Abstract [English language abstract taken directly from the published article]
  1469. // "" => "area",
  1470. // "" => "expedition",
  1471. // "" => "conference",
  1472. "AID" => "doi", // Article Identifier [Article ID values supplied by the publisher may include the pii (controlled publisher identifier) or doi (Digital Object Identifier)] (using a search & replace action, we'll extract only the doi bit)
  1473. // "" => "url",
  1474. // "" => "file",
  1475. "GN" => "notes", // General Note [Supplemental or descriptive information related to the document]
  1476. "PMID" => "notes", // PubMed Unique Identifier [Unique number assigned to each PubMed citation]
  1477. "PMC" => "notes", // PubMed Central ID
  1478. // "" => "call_number",
  1479. // "" => "contribution_id",
  1480. // "" => "online_publication",
  1481. // "" => "online_citation",
  1482. // "" => "approved",
  1483. // "" => "orig_record",
  1484. //# "PUBM" => "online_publication", // Publishing Model [Article's model of print or electronic publishing]
  1485. "SO" => "source", // Source [Composite field containing bibliographic information] (the contents of this special field may be presented within the header message of 'record.php' for easy comparison with the extracted data)
  1486. // "CI" => "", // Copyright Information [Copyright statement provided by the publisher]
  1487. // "CIN" => "", // Comment In [Reference containing a comment about the article]
  1488. // "CON" => "", // Comment On [Reference upon which the article comments]
  1489. // "CRF" => "", // Corrected and republished from [Final, correct version of an article]
  1490. // "CRI" => "", // Corrected and republished in [Original article that was republished in corrected form]
  1491. // "DA" => "", // Date Created [Used for internal processing at NLM]
  1492. // "DCOM" => "", // Date Completed [Used for internal processing at NLM]
  1493. // "EDAT" => "", // Entrez Date [The date the citation was added to PubMed]
  1494. // "EFR" => "", // Erratum For [Cites the original article needing the correction]
  1495. // "EIN" => "", // Erratum In [Reference containing a published erratum to the article]
  1496. // "FIR" => "", // Full Investigator [Full investigator name]
  1497. // "FPS" => "", // Full Personal Name as Subject [Full Personal Name of the subject of the article]
  1498. // "GR" => "", // Grant Number [Research grant numbers, contract numbers, or both that designate financial support by any agency of the US PHS or Wellcome Trust]
  1499. // "GS" => "", // Gene Symbol [Abbreviated gene names (used 1991 through 1996)]
  1500. // "IR" => "", // Investigator [NASA-funded principal investigator]
  1501. // "IRAD" => "", // Investigator Affiliation [Affiliation of NASA-funded principal investigator]
  1502. // "JID" => "", // NLM Unique ID [Unique journal ID in NLM's catalog of books, journals, and audiovisuals]
  1503. // "LR" => "", // Last Revision Date [The date a change was made to the record]
  1504. // "MHDA" => "", // MeSH Date [The date MeSH terms were added to the citation. The MeSH date is the same as the Entrez date until MeSH are added]
  1505. // "OCI" => "", // Other Copyright Information [Copyright owner]
  1506. // "OID" => "", // Other ID [Identification numbers provided by organizations supplying citation data]
  1507. // "ORI" => "", // Original Report In [Cites the original article associated with the patient summary]
  1508. // "OTO" => "", // Other Term Owner [Organization that provided the Other Term data]
  1509. // "OWN" => "", // Owner [Organization acronym that supplied citation data]
  1510. // "PHST" => "", // Publication History Status Date [Publisher supplied dates regarding the article publishing process]
  1511. // "PS" => "", // Personal Name as Subject [Individual is the subject of the article]
  1512. // "PST" => "", // Publication Status [Publication status]
  1513. // "RF" => "", // Number of References [Number of bibliographic references for Review articles]
  1514. // "RIN" => "", // Retraction In [Retraction of the article]
  1515. // "RN" => "", // EC/RN Number [Number assigned by the Enzyme Commission to designate a particular enzyme or by the Chemical Abstracts Service for Registry Numbers]
  1516. // "ROF" => "", // Retraction Of [Article being retracted]
  1517. // "RPF" => "", // Republished From [Original article]
  1518. // "RPI" => "", // Republished In [Corrected and republished article]
  1519. // "SB" => "", // Subset [Journal or citation subset values representing specialized topics]
  1520. // "SFM" => "", // Space Flight Mission [NASA-supplied data space flight/mission name and/or number]
  1521. // "SI" => "", // Secondary Source Identifier [Identifies secondary source databanks and accession numbers of molecular sequences discussed in articles]
  1522. // "SPIN" => "", // Summary For Patients In [Cites a patient summary article]
  1523. // "STAT" => "", // Status Tag [Used for internal processing at NLM]
  1524. // "UIN" => "", // Update In [Update to the article]
  1525. // "UOF" => "", // Update Of [The article being updated]
  1526. );
  1527. // This array lists all MEDLINE tags that may occur multiple times:
  1528. $tagsMultipleArray = array(
  1529. "AU", // see above note for 'AU' at '$tagsToRefbaseFieldsArray'
  1530. "FAU",
  1531. "MH",
  1532. "OT",
  1533. "AID",
  1534. "PMID", // by allowing "PMID", "PMC" and "GN" to occur multiple times we can merge the contents of these fields into the 'notes' field
  1535. "PMC",
  1536. "GN"
  1537. );
  1538. // This array matches MEDLINE reference types with their corresponding refbase types:
  1539. // (MEDLINE types that are currently not supported in refbase will be taken as is but will get
  1540. // prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
  1541. // is not a perfect match but as close as currently possible)
  1542. // "MEDLINE type" => "refbase type"
  1543. $referenceTypesToRefbaseTypesArray = array(
  1544. // "Journal Article" => "Journal Article", // NOTE: PubMed has *many* more types which should be dealt with (see e.g. <http://www.nlm.nih.gov/mesh/pubtypes2006.html> and <http://www.nlm.nih.gov/mesh/pubtypesg2003.html>)
  1545. "JOURNAL ARTICLE" => "Journal Article",
  1546. "REVIEW|Review" => "Journal Article", // in some records, "PT" may occur multiple times (e.g. as in "PT - Journal Article\nPT - Review"), and refbase currently uses the contents of the last "PT" as type
  1547. "Monograph|Account Books|Guidebooks|Handbooks|Textbooks" => "Book Whole",
  1548. "Congresses|Meeting Abstracts" => "Conference Article",
  1549. "Consensus Development Conference(, NIH)?" => "Conference Article",
  1550. "Newspaper Article" => "Newspaper Article",
  1551. "(Annual|Case|Technical) Reports?" => "Report",
  1552. "Manuscripts|Unpublished Works" => "Manuscript",
  1553. "Patents" => "Patent",
  1554. "Maps" => "Map",
  1555. "Editorial" => "Journal Article",
  1556. "Letter" => "Journal Article", // #fallback#
  1557. "Validation Studies" => "Journal Article",
  1558. "Research Support, N\.I\.H\., (Ex|In)tramural *" => "Journal Article",
  1559. "Research Support, (Non-)?U\.S\. Gov\'t(, (Non-)?P\.H\.S\.)? *" => "Journal Article"
  1560. );
  1561. // -----------------------------------------
  1562. // Split input text into individual records:
  1563. $recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the "ER" (= end of record) tag that terminates every MEDLINE record
  1564. // Validate all records that shall be imported:
  1565. list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
  1566. // Parse all records that shall be imported:
  1567. list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "MEDLINE", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
  1568. // Build refbase import array:
  1569. $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
  1570. "1.0", // 'version' - the version of the given array structure
  1571. "http://refbase.net/import/medline/", // 'creator' - the name of the script/importer (preferably given as unique URI)
  1572. "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
  1573. "refbase@extracts.de", // 'contact' - author's email/contact address
  1574. array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
  1575. $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
  1576. return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  1577. }
  1578. // --------------------------------------------------------------------
  1579. // REFWORKS TO REFBASE
  1580. // This function converts records from RefWorks Tagged Format into the standard "refbase"
  1581. // array format which can then be imported by the 'addRecords()' function in 'include.inc.php'.
  1582. // More info on the RefWorks Tagged Format: <http://refworks.scholarsportal.info/Refworks/help/RefWorks_Tagged_Format.htm>
  1583. function refworksToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
  1584. {
  1585. global $errors;
  1586. global $showSource;
  1587. // Define regular expression patterns that will facilitate parsing of RefWorks data:
  1588. // (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
  1589. // Pattern by which the input text will be split into individual records:
  1590. $recordDelimiter = "\s*[\r\n][\r\n][\r\n]+\s*";
  1591. // Pattern by which records will be split into individual fields:
  1592. $fieldDelimiter = "[\r\n]+(?=\w\w )";
  1593. // Pattern by which fields will be split into their field label (tag) and field data:
  1594. $dataDelimiter = "(?<=^\w\w) ";
  1595. // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
  1596. // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
  1597. // - the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1598. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1599. $personDelimiter = "/ *; */";
  1600. // Pattern by which a person's family name is separated from the given name (or initials):
  1601. // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1602. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1603. $familyNameGivenNameDelimiter = "/ *, */";
  1604. // Specifies whether the person's family name comes first within a person's name
  1605. // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
  1606. $familyNameFirst = true;
  1607. // Specifies whether a person's full given name(s) shall be shortened to initial(s):
  1608. // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
  1609. // - if set to 'false', given names (and any initials) are taken as is
  1610. // - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
  1611. $shortenGivenNames = true;
  1612. // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
  1613. $transformCase = true;
  1614. // Preprocessor actions:
  1615. // Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
  1616. // (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
  1617. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1618. // "/Search Pattern/" => "Replace Pattern"
  1619. $preprocessorActionsArray = array();
  1620. // Postprocessor actions:
  1621. // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
  1622. // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
  1623. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1624. // "/Search Pattern/" => "Replace Pattern"
  1625. $postprocessorActionsArray = array(
  1626. array(
  1627. 'fields' => array("year"),
  1628. 'actions' => array(
  1629. "/^.*?(\d{4}).*/" => "\\1" // for the 'year' field, extract any four-digit number (and discard everything else)
  1630. )
  1631. ),
  1632. array(
  1633. 'fields' => array("title"),
  1634. 'actions' => array(
  1635. "/[,.;:!] *$/" => "" // remove any punctuation (except for question marks) from end of field contents
  1636. )
  1637. ),
  1638. array(
  1639. 'fields' => array("title", "abstract", "orig_title", "series_title", "abbrev_series_title", "notes"), // convert RefWorks font attributes (which RefWorks supports in title fields, notes, abstracts and user 1 - 5 fields)
  1640. 'actions' => array(
  1641. "/0RW1S34RfeSDcfkexd09rT3(.+?)1RW1S34RfeSDcfkexd09rT3/" => "[super:\\1]", // replace RefWorks indicators for superscript text with refbase markup ('[super:...]')
  1642. "/0RW1S34RfeSDcfkexd09rT4(.+?)1RW1S34RfeSDcfkexd09rT4/" => "[sub:\\1]", // replace RefWorks indicators for subscript text with refbase markup ('[sub:...]')
  1643. "/0RW1S34RfeSDcfkexd09rT2(.+?)1RW1S34RfeSDcfkexd09rT2/" => "_\\1_", // replace RefWorks indicators for italic text with refbase markup ('_..._')
  1644. "/0RW1S34RfeSDcfkexd09rT0(.+?)1RW1S34RfeSDcfkexd09rT0/" => "**\\1**", // replace RefWorks indicators for bold text with refbase markup ('**...**')
  1645. "/0RW1S34RfeSDcfkexd09rT1(.+?)1RW1S34RfeSDcfkexd09rT1/" => "__\\1__" // replace RefWorks indicators for underline text with refbase markup ('__...__')
  1646. )
  1647. )
  1648. );
  1649. // This array lists patterns which match all RefWorks tags that must occur within a record to be recognized as valid RefWorks record:
  1650. // (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
  1651. // the search patterns MUST include the leading & trailing slashes.)
  1652. // "tag display name" => "tag search pattern"
  1653. $requiredTagsArray = array(
  1654. "RT" => "/^RT /m"
  1655. );
  1656. // This array matches RefWorks tags with their corresponding refbase fields:
  1657. // (fields that are unsupported in either RefWorks or refbase are commented out)
  1658. // "RefWorks tag" => "refbase field" // RefWorks tag name (comment)
  1659. $tagsToRefbaseFieldsArray = array(
  1660. "RT" => "type", // Reference Type (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
  1661. // "" => "thesis",
  1662. "A1" => "author", // Primary Authors
  1663. "A2" => "editor", // Secondary Authors (Editors)
  1664. "A3" => "series_editor", // Tertiary Authors (Series Editors)
  1665. // "A4" => "", // Quaternary Authors (Translators)
  1666. // "A5" => "", // Quinary Authors (Compliers)
  1667. // "A6" => "", // Website Editors
  1668. "AD" => "address", // Author Address
  1669. // "" => "corporate_author",
  1670. "T1" => "title", // Primary Title
  1671. "OT" => "orig_title", // Original Foreign Title
  1672. // "ST" => "", // Shortened Title
  1673. // "WT" => "", // Website Title
  1674. // "FD" => "", // Publication Data, Free Form (this field is used for date information such as a season or month and day; year data is solely placed in the year field, i.e., "YR 2003")
  1675. "YR" => "year", // Year
  1676. // "RD" => "", // Retrieved Date
  1677. // "WV" => "", // Website Version
  1678. // "WP" => "", // Date of Electronic Publication
  1679. "JF" => "publication", // Periodical name: full format
  1680. "JO" => "abbrev_journal", // Periodical name: standard abbreviation
  1681. "T2" => array("Book, Section" => "publication", "Other" => "series_title"), // Secondary Title
  1682. "T3" => "abbrev_series_title", // Tertiary Title
  1683. "VO" => "volume", // Volume
  1684. // "NV" => "", // Number of volumes
  1685. "IS" => "issue", // Issue
  1686. "SP" => "startPage", // Start Page (contents of the special fields 'startPage' and 'endPage' will be merged into a range and copied to the refbase 'pages' field)
  1687. "OP" => "endPage", // Either [1] "endPage", Other Pages ('SP' is the tag for the starting page and should only contain this information; the 'OP' tag is used for any additional pages or page information) or [2] Original publication
  1688. // "" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
  1689. // "" => "series_issue",
  1690. "PB" => "publisher", // Publisher
  1691. "PP" => "place", // Place of Publication
  1692. "ED" => "edition", // Edition
  1693. // "" => "medium",
  1694. "SN" => array("Book, Section" => "isbn", "Book, Edited" => "isbn", "Book, Whole" => "isbn", "Dissertation" => "isbn", "Dissertation/Thesis" => "isbn", "Other" => "issn"), // Book Whole & Book Chapter: ISBN; Other reference types: ISSN
  1695. "LA" => "language", // Language
  1696. // "" => "summary_language",
  1697. "K1" => "keywords", // Keywords
  1698. "AB" => "abstract", // Abstract
  1699. // "" => "area",
  1700. // "" => "expedition",
  1701. // "" => "conference",
  1702. "DO" => "doi", // Digital Object Identifier
  1703. "LK" => "url", // Links
  1704. "UL" => "url", // URL
  1705. // "" => "file", // Link to PDF
  1706. // "" => "related", // Related Records
  1707. "NO" => "notes", // Notes
  1708. "ID" => "call_number", // Reference Identifier (NOTE: if no other field gets mapped to the 'cite_key' field, the contents of the 'call_number' field will be also copied to the 'cite_key' field of the currently logged-in user)
  1709. "CN" => "notes", // Call Number (if 'ID' would be mapped to 'cite_key', contents of this field could go into the 'call_number' field)
  1710. "IP" => "notes", // Identifying Phrase (NOTE: should we rather put the contents of this field into the 'cite_key' field?)
  1711. // "U1" => "", // User definable 1
  1712. // "U2" => "", // User definable 2
  1713. // "U3" => "", // User definable 3
  1714. // "U4" => "", // User definable 4
  1715. // "U5" => "", // User definable 5
  1716. // "" => "contribution_id",
  1717. // "" => "online_publication",
  1718. // "" => "online_citation",
  1719. // "" => "approved",
  1720. // "" => "orig_record",
  1721. // "" => "copy", // Reprint status
  1722. "AV" => "notes", // Availability
  1723. // "AN" => "", // Accession Number
  1724. // "CA" => "", // Caption
  1725. // "CL" => "", // Classification
  1726. // "SF" => "", // Subfile/Database
  1727. // "DB" => "", // Database
  1728. // "DS" => "", // Data Source
  1729. // "SL" => "", // Sponsoring Library
  1730. // "LL" => "", // Sponsoring Library Location
  1731. // "CR" => "", // Cited References
  1732. );
  1733. // This array lists all RefWorks tags that may occur multiple times:
  1734. $tagsMultipleArray = array(
  1735. "A1",
  1736. "A2",
  1737. "A3",
  1738. // "A4",
  1739. // "A5",
  1740. // "A6",
  1741. "K1",
  1742. // "LK", // currently, refbase does only support one link per record
  1743. // "UL", // currently, refbase does only support one URL per record
  1744. "ID",
  1745. "CN",
  1746. "IP",
  1747. "NO",
  1748. "AV"
  1749. );
  1750. // This array matches RefWorks reference types with their corresponding refbase types:
  1751. // (RefWorks types that are currently not supported in refbase will be taken as is but will get
  1752. // prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
  1753. // is not a perfect match but as close as currently possible)
  1754. // "RefWorks type" => "refbase type" // name of RefWorks reference type (comment)
  1755. $referenceTypesToRefbaseTypesArray = array(
  1756. "Abstract" => "Abstract", // Abstract
  1757. "Artwork" => "Unsupported: Artwork", // Artwork
  1758. "Bills\/Resolutions" => "Unsupported: Bills/Resolutions", // Bills/Resolutions
  1759. "Book,? (Section|Chapter)" => "Book Chapter", // Book, Section
  1760. "Book, Edited" => "Book Whole", // Book, Edited (#fallback#)
  1761. "Book, Whole" => "Book Whole", // Book, Whole
  1762. "Case\/Court Decisions" => "Unsupported: Case/Court Decisions", // Case/Court Decisions
  1763. "Computer Program" => "Software", // Computer Program
  1764. "Conference Proceeding" => "Conference Article", // Conference Proceeding
  1765. "Dissertation(\/Thesis)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
  1766. "Dissertation(\/Thesis)?, Unpublished" => "Thesis", // Dissertation/Thesis, Unpublished (#fallback#) (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
  1767. "Generic" => "Miscellaneous", // Generic
  1768. "Grant" => "Unsupported: Grant", // Grant
  1769. "Hearing" => "Unsupported: Hearing", // Hearing
  1770. "Journal" => "Journal Article", // Journal
  1771. "Journal, Electronic" => "Journal Article", // Journal, Electronic (#fallback#) (function 'parseRecords()' should set the 'online_publication' field accordingly)
  1772. "Laws\/Statutes" => "Unsupported: Laws/Statutes", // Laws/Statutes
  1773. "Magazine Article" => "Magazine Article", // Magazine Article
  1774. "Map" => "Map", // Map
  1775. "Monograph" => "Book Whole", // Monograph (#fallback#)
  1776. "Motion Picture" => "Unsupported: Motion Picture", // Motion Picture
  1777. "Music Score" => "Unsupported: Music Score", // Music Score
  1778. "Newspaper Article" => "Newspaper Article", // Newspaper Article
  1779. "Online Discussion Forum" => "Unsupported: Online Discussion Forum", // Online Discussion Forum
  1780. "Patent" => "Patent", // Patent
  1781. "Personal Communication" => "Unsupported: Personal Communication", // Personal Communication
  1782. "Report" => "Report", // Report
  1783. "Sound Recording" => "Unsupported: Sound Recording", // Sound Recording
  1784. "Thesis(\/Dissertation)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
  1785. "Unpublished Material" => "Manuscript", // Unpublished Material (#fallback#)
  1786. "Video\/DVD" => "Unsupported: Video/DVD", // Video/DVD
  1787. "Web Page" => "Unsupported: Web Page" // Web Page
  1788. );
  1789. // -----------------------------------------
  1790. // Split input text into individual records:
  1791. $recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the "ER" (= end of record) tag that terminates every RefWorks record
  1792. // Validate all records that shall be imported:
  1793. list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
  1794. // Parse all records that shall be imported:
  1795. list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "RefWorks", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
  1796. // Build refbase import array:
  1797. $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
  1798. "1.0", // 'version' - the version of the given array structure
  1799. "http://refbase.net/import/refworks/", // 'creator' - the name of the script/importer (preferably given as unique URI)
  1800. "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
  1801. "refbase@extracts.de", // 'contact' - author's email/contact address
  1802. array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
  1803. $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
  1804. return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  1805. }
  1806. // --------------------------------------------------------------------
  1807. // SCIFINDER TO REFBASE
  1808. // This function converts records from SciFinder (<http://www.cas.org/SCIFINDER/>) Tagged Format
  1809. // into the standard "refbase" array format which can be then imported by the 'addRecords()' function
  1810. // in 'include.inc.php'.
  1811. function scifinderToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
  1812. {
  1813. global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
  1814. global $errors;
  1815. global $showSource;
  1816. // The SciFinder format uses variable-length field label names, which makes it
  1817. // impossible to match field labels using regular expressions with perl-style
  1818. // look-behinds (such as '(?<=...)'). This poses a problem when specifying an
  1819. // appropriate regex pattern for variable '$dataDelimiter'. Therefore, we'll
  1820. // preprocess the '$sourceText' so that delimiters between field labels and
  1821. // field data can be easily matched.
  1822. $sourceText = preg_replace("/^(FIELD [^:\r\n]+):/m", "\\1__dataDelimiter__", $sourceText); // replace the first colon (":"), which separates a field label from its data, with a custom string ("__dataDelimiter__")
  1823. // Define regular expression patterns that will facilitate parsing of SciFinder data:
  1824. // (patterns must be specified as perl-style regular expression, without the leading & trailing slashes, if not stated otherwise)
  1825. // Pattern by which the input text will be split into individual records:
  1826. $recordDelimiter = "\s*(START_RECORD[\r\n]+|[\r\n]+END_RECORD)\s*";
  1827. // Pattern by which records will be split into individual fields:
  1828. $fieldDelimiter = "[\r\n]+FIELD *";
  1829. // Pattern by which fields will be split into their field label (tag) and field data:
  1830. $dataDelimiter = " *__dataDelimiter__ *";
  1831. // Pattern by which multiple persons are separated within the author, editor or series editor fields of the source data:
  1832. // (Notes: - name standardization occurs after multiple author fields have been merged by '; '
  1833. // - the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1834. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1835. $personDelimiter = "/ *; */";
  1836. // Pattern by which a person's family name is separated from the given name (or initials):
  1837. // (the split pattern must be specified as perl-style regular expression (including the leading & trailing
  1838. // slashes) and may include mode modifiers (such as '/.../i' to perform a case insensitive match))
  1839. $familyNameGivenNameDelimiter = "/ *, */";
  1840. // Specifies whether the person's family name comes first within a person's name
  1841. // ('true' means that the family name is followed by the given name (or initials), 'false' means that the person's family name comes *after* the given name (or initials))
  1842. $familyNameFirst = true;
  1843. // Specifies whether a person's full given name(s) shall be shortened to initial(s):
  1844. // (Notes: - if set to 'true', given names will be abbreviated and initials will get normalized (meaning removal of extra whitespace, adding of dots between initials, etc)
  1845. // - if set to 'false', given names (and any initials) are taken as is
  1846. // - in your database, you should stick to either fully written given names OR initials; if you mix these, records won't get sorted correctly on citation output)
  1847. $shortenGivenNames = true;
  1848. // Specifies whether fields whose contents are entirely in upper case shall be transformed to title case ('true') or not ('false'):
  1849. $transformCase = true;
  1850. // Preprocessor actions:
  1851. // Defines search & replace 'actions' that will be applied to each record's raw source data if the pattern in the corresponding 'match' element is matched:
  1852. // (If you don't want to perform any preprocessor actions, specify an empty array, like: '$preprocessorActionsArray = array();'.
  1853. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1854. // "/Search Pattern/" => "Replace Pattern"
  1855. $preprocessorActionsArray = array();
  1856. // Postprocessor actions:
  1857. // Defines search & replace 'actions' that will be applied to all those refbase fields that are listed in the corresponding 'fields' element:
  1858. // (If you don't want to perform any search and replace actions, specify an empty array, like: '$postprocessorActionsArray = array();'.
  1859. // Note that, in this case, the search patterns MUST include the leading & trailing slashes -- which is done to allow for mode modifiers such as 'imsxU'.)
  1860. // "/Search Pattern/" => "Replace Pattern"
  1861. $postprocessorActionsArray = array(
  1862. array(
  1863. 'fields' => array("year"),
  1864. 'actions' => array(
  1865. "/^.*?(\d{4}).*/" => "\\1", // for the 'year' field, extract any four-digit number (and discard everything else)
  1866. "/^\D+$/" => "" // clear the 'year' field if it doesn't contain any number
  1867. )
  1868. ),
  1869. array(
  1870. 'fields' => array("pages"),
  1871. 'actions' => array(
  1872. "/(\d+ *pp?)\./" => "\\1" // strip any trailing dots from "xx pp." or "xx p." in the 'pages' field
  1873. )
  1874. ),
  1875. array(
  1876. 'fields' => array("title", "address"),
  1877. 'actions' => array(
  1878. "/[,.;:!] *$/" => "", // remove any punctuation (except for question marks) from end of field contents
  1879. "/,(?! )/" => ", " // add a space after a comma if missing (this mainly regards the 'Corporate Source' -> 'address' field)
  1880. )
  1881. ),
  1882. array(
  1883. 'fields' => array("abstract"),
  1884. 'actions' => array(
  1885. '/\\\\"/' => '"', // convert escaped quotes (\") into unescaped quotes (")
  1886. "/ *\[on SciFinder \(R\)\]$/" => "" // remove attribution string " [on SciFinder (R)]" from end of field contents
  1887. )
  1888. ),
  1889. array(
  1890. 'fields' => array("language"),
  1891. 'actions' => array(
  1892. "/^[$lower$punct ]+(?=[$upper][$lower]+)/$patternModifiers" => "", // remove any all-lowercase prefix string (so that field contents such as "written in English." get reduced to "English.")
  1893. "/language unavailable/" => "", // remove "language unavailable" string
  1894. "/[$punct] *$/$patternModifiers" => "" // remove any punctuation from end of field contents
  1895. )
  1896. ),
  1897. array(
  1898. 'fields' => array("notes"),
  1899. 'actions' => array(
  1900. "/^Can (\d+)/" => "CAN:\\1", // convert any existing "CAN " prefix in front of any number that's at the beginning of the 'notes' field (which originated from the SciFinder 'Chemical Abstracts Number(CAN)' field)
  1901. "/^(\d+)/" => "CAN:\\1" // insert a "CAN:" prefix in front of any number that's at the beginning of the 'notes' field (we map the SciFinder 'Chemical Abstracts Number(CAN)' field to the 'notes' field)
  1902. )
  1903. )
  1904. );
  1905. // This array lists patterns which match all SciFinder tags that must occur within a record to be recognized as valid SciFinder record:
  1906. // (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
  1907. // the search patterns MUST include the leading & trailing slashes.)
  1908. // "tag display name" => "tag search pattern"
  1909. $requiredTagsArray = array(
  1910. "Document Type" => "/^FIELD Document Type/m"
  1911. );
  1912. // This array matches SciFinder tags with their corresponding refbase fields:
  1913. // (fields that are unsupported in either SciFinder or refbase are commented out)
  1914. // "SciFinder tag" => "refbase field" // SciFinder tag name (comment)
  1915. $tagsToRefbaseFieldsArray = array(
  1916. "Document Type" => "type", // Document Type (IMPORTANT: the array element that maps to 'type' must be listed as the first element!)
  1917. // "" => "thesis",
  1918. "Author" => "author", // Primary Authors
  1919. // "" => "editor", // Secondary Authors (Editors)
  1920. // "" => "series_editor", // Tertiary Authors (Series Editors)
  1921. "Corporate Source" => "address", // Corporate Source
  1922. // "" => "corporate_author", // Corporate Author
  1923. "Title" => "title", // Primary Title
  1924. // "" => "orig_title", // Original Foreign Title
  1925. "Publication Year" => "year", // Publication Year
  1926. "Publication Date" => "year", // Publication Date
  1927. "Journal Title" => "publication", // Periodical name: full format
  1928. // "" => "abbrev_journal", // Periodical name: standard abbreviation
  1929. // "" => array("Book, Section" => "publication", "Other" => "series_title"), // Secondary Title
  1930. // "" => "abbrev_series_title", // Tertiary Title
  1931. "Volume" => "volume", // Volume
  1932. "Issue" => "issue", // Issue
  1933. "Page" => "pages", // Page
  1934. // "" => "series_volume", // (for 'series_volume' and 'series_issue', some magic will be applied within the 'parseRecords()' function)
  1935. // "" => "series_issue",
  1936. // "" => "publisher", // Publisher
  1937. // "" => "place", // Place of Publication
  1938. // "" => "edition", // Edition
  1939. // "" => "medium", // Medium
  1940. "Internat.Standard Doc. Number" => array("Book, Section" => "isbn", "Book, Edited" => "isbn", "Book" => "isbn", "Dissertation" => "isbn", "Dissertation/Thesis" => "isbn", "Other" => "issn"), // Book Whole & Book Chapter: ISBN; Other reference types: ISSN
  1941. "Language" => "language", // Language
  1942. // "" => "summary_language", // Summary Language
  1943. "Index Terms" => "keywords", // Index Terms
  1944. // "Index Terms(2)" => "keywords", // Index Terms(2)
  1945. "Abstract" => "abstract", // Abstract
  1946. // "" => "area",
  1947. // "" => "expedition",
  1948. // "" => "conference",
  1949. // "" => "doi", // Digital Object Identifier
  1950. "URL" => "url", // URL
  1951. // "" => "file", // Link to PDF
  1952. // "" => "related", // Related Records
  1953. // "" => "call_number", // Call Number
  1954. "Chemical Abstracts Number(CAN)" => "notes", // Chemical Abstracts Number(CAN)
  1955. // "" => "contribution_id",
  1956. // "" => "online_publication",
  1957. // "" => "online_citation",
  1958. // "" => "approved",
  1959. // "" => "orig_record",
  1960. // "" => "copy", // Reprint status
  1961. // "Copyright" => "", // Copyright
  1962. // "Database" => "", // Database
  1963. // "Accession Number" => "", // Accession Number
  1964. // "Section Code" => "", // Section Code
  1965. // "Section Title" => "", // Section Title
  1966. // "CA Section Cross-references" => "", // CA Section Cross-references
  1967. // "CODEN" => "", // CODEN
  1968. // "CAS Registry Numbers" => "", // CAS Registry Numbers
  1969. // "Supplementary Terms" => "", // Supplementary Terms
  1970. // "PCT Designated States" => "", // PCT Designated States
  1971. // "PCT Reg. Des. States" => "", // PCT Reg. Des. States
  1972. // "Reg.Pat.Tr.Des.States" => "", // Reg.Pat.Tr.Des.States
  1973. // "Main IPC" => "", // Main IPC
  1974. // "IPC" => "", // IPC
  1975. // "Secondary IPC" => "", // Secondary IPC
  1976. // "Additional IPC" => "", // Additional IPC
  1977. // "Index IPC" => "", // Index IPC
  1978. // "Inventor Name" => "", // Inventor Name
  1979. // "National Patent Classification" => "", // National Patent Classification
  1980. // "Patent Application Country" => "", // Patent Application Country
  1981. // "Patent Application Date" => "", // Patent Application Date
  1982. // "Patent Application Number" => "", // Patent Application Number
  1983. // "Patent Assignee" => "", // Patent Assignee
  1984. // "Patent Country" => "", // Patent Country
  1985. // "Patent Kind Code" => "", // Patent Kind Code
  1986. // "Patent Number" => "", // Patent Number
  1987. // "Priority Application Country" => "", // Priority Application Country
  1988. // "Priority Application Number" => "", // Priority Application Number
  1989. // "Priority Application Date" => "", // Priority Application Date
  1990. // "Citations" => "", // Citations
  1991. );
  1992. // This array lists all SciFinder tags that may occur multiple times:
  1993. $tagsMultipleArray = array(
  1994. // "Chemical Abstracts Number(CAN)",
  1995. // "Index Terms", // by allowing "Index Terms" and "Index Terms(2)" to occur multiple times we can merge contents of both of these fields into the 'keywords' field
  1996. // "Index Terms(2)",
  1997. "Publication Year", // by allowing "Publication Year" and "Publication Date" to occur multiple times we can merge contents of both of these fields into the 'year' field (then, we'll extract the first four-digit number from it)
  1998. "Publication Date"
  1999. );
  2000. // This array matches SciFinder reference types with their corresponding refbase types:
  2001. // (SciFinder types that are currently not supported in refbase will be taken as is but will get
  2002. // prefixed with an "Unsupported: " label; '#fallback#' in comments indicates a type mapping that
  2003. // is not a perfect match but as close as currently possible)
  2004. // (NOTE: the commented reference types are NOT from SciFinder but are remains from the 'refworksToRefbase()' function!)
  2005. // "SciFinder type" => "refbase type" // name of SciFinder reference type (comment)
  2006. $referenceTypesToRefbaseTypesArray = array(
  2007. // "Abstract" => "Abstract", // Abstract
  2008. // "Artwork" => "Unsupported: Artwork", // Artwork
  2009. // "Bills\/Resolutions" => "Unsupported: Bills/Resolutions", // Bills/Resolutions
  2010. // "Book,? (Section|Chapter)" => "Book Chapter", // Book, Section
  2011. // "Book, Edited" => "Book Whole", // Book, Edited (#fallback#)
  2012. "Book(;.*)?" => "Book Whole", // Book
  2013. // "Case\/Court Decisions" => "Unsupported: Case/Court Decisions", // Case/Court Decisions
  2014. // "Computer Program" => "Software", // Computer Program
  2015. // "Conference Proceeding" => "Conference Article", // Conference Proceeding
  2016. // "Dissertation(\/Thesis)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
  2017. // "Dissertation(\/Thesis)?, Unpublished" => "Thesis", // Dissertation/Thesis, Unpublished (#fallback#) (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
  2018. // "Generic" => "Miscellaneous", // Generic
  2019. // "Grant" => "Unsupported: Grant", // Grant
  2020. // "Hearing" => "Unsupported: Hearing", // Hearing
  2021. "Journal(;.*)?" => "Journal Article", // Journal
  2022. // "Journal, Electronic" => "Journal Article", // Journal, Electronic (#fallback#) (function 'parseRecords()' should set the 'online_publication' field accordingly)
  2023. // "Laws\/Statutes" => "Unsupported: Laws/Statutes", // Laws/Statutes
  2024. // "Magazine Article" => "Magazine Article", // Magazine Article
  2025. // "Map" => "Map", // Map
  2026. // "Monograph" => "Book Whole", // Monograph (#fallback#)
  2027. // "Motion Picture" => "Unsupported: Motion Picture", // Motion Picture
  2028. // "Music Score" => "Unsupported: Music Score", // Music Score
  2029. // "Newspaper Article" => "Newspaper Article", // Newspaper Article
  2030. // "Online Discussion Forum" => "Unsupported: Online Discussion Forum", // Online Discussion Forum
  2031. // "Patent" => "Patent", // Patent
  2032. // "Personal Communication" => "Unsupported: Personal Communication", // Personal Communication
  2033. "Report(;.*)?" => "Report", // Report
  2034. // "Sound Recording" => "Unsupported: Sound Recording", // Sound Recording
  2035. // "Thesis(\/Dissertation)?" => "Thesis", // Dissertation/Thesis (function 'parseRecords()' will set the special type 'Thesis' back to 'Book Whole' and adopt the refbase 'thesis' field)
  2036. "Preprint" => "Manuscript", // Preprint (#fallback#)
  2037. // "Video\/DVD" => "Unsupported: Video/DVD", // Video/DVD
  2038. // "Web Page" => "Unsupported: Web Page" // Web Page
  2039. );
  2040. // Other SciFinder Document Types which I've encountered so far:
  2041. // "General Review" => "" // General Review
  2042. // "Online Computer File" => "" // Online Computer File
  2043. // -----------------------------------------
  2044. // Split input text into individual records:
  2045. $recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split on the "START_RECORD"/"END_RECORD" tags that delimit every SciFinder record
  2046. // Validate all records that shall be imported:
  2047. list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
  2048. // Parse all records that shall be imported:
  2049. list($parsedRecordsArray, $recordsCount) = parseRecords($recordArray, "SciFinder", $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray);
  2050. // Build refbase import array:
  2051. $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
  2052. "1.0", // 'version' - the version of the given array structure
  2053. "http://refbase.net/import/scifinder/", // 'creator' - the name of the script/importer (preferably given as unique URI)
  2054. "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
  2055. "refbase@extracts.de", // 'contact' - author's email/contact address
  2056. array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
  2057. $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
  2058. return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  2059. }
  2060. // --------------------------------------------------------------------
  2061. // IDENTIFY SOURCE FORMAT
  2062. // This function tries to identify the format of the input text:
  2063. function identifySourceFormat($sourceText)
  2064. {
  2065. $sourceFormat = "";
  2066. // CSA format:
  2067. if (preg_match("/^Record \d+ of \d+/m", $sourceText) AND preg_match("/^SO: Source *[\r\n]+ {4,4}/m", $sourceText)) // CSA records must at least start with a record identifier ("Record x of xx") and contain the "SO: Source" tag
  2068. $sourceFormat = "CSA";
  2069. // PubMed MEDLINE format:
  2070. elseif (preg_match("/^PMID- /m", $sourceText) AND preg_match("/^PT - /m", $sourceText)) // PubMed MEDLINE records must at least contain the "PMID" and "PT" tags
  2071. $sourceFormat = "Pubmed Medline";
  2072. // PubMed XML format:
  2073. elseif (preg_match("/<PubmedArticle[^<>\r\n]*>/i", $sourceText) AND preg_match("/<\/PubmedArticle>/", $sourceText)) // PubMed XML records must at least contain the "<PubmedArticle>...</PubmedArticle>" root element
  2074. $sourceFormat = "Pubmed XML";
  2075. // ISI Web of Science format:
  2076. elseif (preg_match("/^PT /m", $sourceText) AND preg_match("/^SO /m", $sourceText) AND preg_match("/^ER *[\r\n]/m", $sourceText)) // ISI records must at least contain the "PT" and "SO" tags and end with an "ER" tag
  2077. $sourceFormat = "ISI";
  2078. // RIS format:
  2079. elseif (preg_match("/^TY - /m", $sourceText) AND preg_match("/^ER -/m", $sourceText)) // RIS records must at least start with the "TY" tag and end with an "ER" tag (we'll only check for their presence, though)
  2080. $sourceFormat = "RIS";
  2081. // RefWorks format:
  2082. elseif (preg_match("/^RT /m", $sourceText)) // RefWorks records must at least start with the "RT" tag (we'll only check for its presence, though)
  2083. $sourceFormat = "RefWorks";
  2084. // SciFinder format:
  2085. elseif (preg_match("/^START_RECORD/m", $sourceText) AND preg_match("/^END_RECORD/m", $sourceText)) // SciFinder records must at least start with the "START_RECORD" tag and end with an "END_RECORD" tag (we'll only check for their presence, though)
  2086. $sourceFormat = "SciFinder";
  2087. // Copac format:
  2088. elseif (preg_match("/^TI- /m", $sourceText) AND preg_match("/^HL- /m", $sourceText)) // Copac records must at least contain the "TI" and "HL" tags
  2089. $sourceFormat = "Copac";
  2090. // Endnote format:
  2091. elseif (preg_match("/^%0 /m", $sourceText)) // Endnote records must at least contain the "%0" tag
  2092. $sourceFormat = "Endnote"; // Endnote tagged text aka Endnote Refer
  2093. // MODS XML format:
  2094. elseif (preg_match("/<mods[^<>\r\n]*>/i", $sourceText) AND preg_match("/<\/mods>/", $sourceText)) // MODS XML records must at least contain the "<mods>...</mods>" root element
  2095. $sourceFormat = "MODS XML";
  2096. // Endnote XML format:
  2097. elseif (preg_match("/<xml>[^<>]*?<records>[^<>]*?<record>/mi", $sourceText)) // Endnote XML records must at least contain the elements "<xml>...<records>...<record>"
  2098. $sourceFormat = "Endnote XML";
  2099. // BibTeX format:
  2100. elseif (preg_match("/^@\w+\s*\{[^ ,\r\n]* *, *[\r\n]/m", $sourceText)) // BibTeX records must start with the "@" sign, followed by a type specifier and an optional cite key (such as in '@article{steffens1988,')
  2101. $sourceFormat = "BibTeX";
  2102. // CrossRef "unixref" XML format:
  2103. // TODO: improve match
  2104. elseif (preg_match("/<doi_records[^<>\r\n]*>/i", $sourceText) AND preg_match("/<\/doi_records>/", $sourceText)) // CrossRef XML records must at least contain the "<doi_records>...</doi_records>" root element
  2105. $sourceFormat = "CrossRef XML";
  2106. // arXiv.org Atom XML OpenSearch format:
  2107. // TODO: add regex pattern that matches arXiv.org Atom feeds
  2108. return $sourceFormat;
  2109. }
  2110. // --------------------------------------------------------------------
  2111. // IDENTIFY SOURCE ID
  2112. // This function tries to identify the type of the IDs contained in the input string:
  2113. // TODO:
  2114. // - modify the code so that '$sourceIDs' can contain a mixture of any supported IDs
  2115. // - after splitting on whitespace, verify ALL items and check whether they match one of the recognized ID patterns
  2116. // - better identification/verification of OpenURLs
  2117. // - to support OpenURL context objects from COinS or Atom XML, we need to decode ampersand characters ('&amp;' -> '&'),
  2118. // and allow for OpenURLs that don't start with '?' or '&'
  2119. function identifySourceID($sourceIDs)
  2120. {
  2121. $idFormat = "";
  2122. // DOIs:
  2123. if (preg_match("#(?<=^|\s)(doi:|http://dx\.doi\.org/)?10\.\d{4}/\S+?(?=$|\s)#i", $sourceIDs))
  2124. $idFormat = "CrossRef XML";
  2125. // OpenURLs:
  2126. elseif (preg_match("#(?<=^|\s)(openurl:|http://.+?(?=\?))?.*?(?<=[?&])ctx_ver=Z39\.88-2004(?=&|$).*?(?=$|\s)#i", $sourceIDs)) // OpenURLs must contain the 'ctx_ver=Z39.88-2004' key/value pair
  2127. $idFormat = "CrossRef XML";
  2128. // arXiv IDs:
  2129. elseif (preg_match("#(?<=^|\s)(arXiv:|http://arxiv\.org/abs/)?([\w.-]+/\d{7}|\d{4}\.\d{4,})(v\d+)?(?=$|\s)#i", $sourceIDs))
  2130. $idFormat = "arXiv XML";
  2131. // PubMed IDs:
  2132. elseif (preg_match("/(?<=^|\s)\d+(?=$|\s)/", $sourceIDs))
  2133. $idFormat = "Pubmed Medline";
  2134. return $idFormat;
  2135. }
  2136. // --------------------------------------------------------------------
  2137. // SPLIT SOURCE TEXT
  2138. // This function splits the input text at the specified delimiter and returns an array of records:
  2139. function splitSourceText($sourceText, $splitPattern, $returnEmptyElements)
  2140. {
  2141. if ($returnEmptyElements) // include empty elements:
  2142. $recordArray = preg_split("/" . $splitPattern . "/", $sourceText);
  2143. else // omit empty elements:
  2144. $recordArray = preg_split("/" . $splitPattern . "/", $sourceText, -1, PREG_SPLIT_NO_EMPTY); // the 'PREG_SPLIT_NO_EMPTY' flag causes only non-empty pieces to be returned
  2145. return $recordArray;
  2146. }
  2147. // --------------------------------------------------------------------
  2148. // VALIDATE RECORDS
  2149. // This function takes an array of records containing the source data (as tagged text) and
  2150. // checks for each record if any of the required fields (given in '$requiredTagsArray') are missing:
  2151. function validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors)
  2152. {
  2153. // count how many records are available:
  2154. $recordsCount = count($recordArray);
  2155. $importRecordNumbersRecognizedFormatArray = array(); // initialize array variable which will hold all record numbers of those records that shall be imported AND which were of a recognized format
  2156. $importRecordNumbersNotRecognizedFormatArray = array(); // same for all records that shall be imported BUT which had an UNrecognized format
  2157. for ($i=0; $i<$recordsCount; $i++) // for each record...
  2158. {
  2159. if (($importRecordsRadio == "only") AND (!in_array(($i+1), $importRecordNumbersArray))) // if we're NOT supposed to import this record... ('$i' starts with 0 so we have to add 1 to point to the correct record number)
  2160. {
  2161. continue; // process next record (if any)
  2162. }
  2163. else // ...validate the format of the current record:
  2164. {
  2165. $missingTagsArray = array();
  2166. // check for required fields:
  2167. if (!empty($recordArray[$i]))
  2168. foreach ($requiredTagsArray as $requiredTagName => $requiredTagPattern)
  2169. if (!preg_match($requiredTagPattern, $recordArray[$i])) // if required field is missing
  2170. $missingTagsArray[] = $requiredTagName;
  2171. // we assume a single record as valid if the '$recordArray[$i]' variable is not empty
  2172. // and if all tag search patterns in '$requiredTagsArray' were matched:
  2173. if (!empty($recordArray[$i]) AND empty($missingTagsArray))
  2174. {
  2175. $importRecordNumbersRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format IS recognized ('$i' starts with 0 so we have to add 1 to point to the correct record number)
  2176. }
  2177. else // unrecognized record format
  2178. {
  2179. $importRecordNumbersNotRecognizedFormatArray[] = $i + 1; // append this record number to the list of numbers whose record format is NOT recognized
  2180. // prepare an appropriate error message:
  2181. $errorMessage = "Record " . ($i + 1) . ":";
  2182. // Handle PubMed Medline errors:
  2183. // TODO: - improve identification of Medline errors
  2184. // - handle PubMed XML
  2185. if (preg_match("/^\s*<html>/i", $recordArray[$i]) AND preg_match("/Error occurred:/", $recordArray[$i])) // a PubMed error occurred, probably because an unrecognized PubMed ID was given
  2186. $errorMessage .= preg_replace("/.*Error occurred: *([^<>]+).*/s", " PubMed error: \\1.", $recordArray[$i]); // attempt to extract PubMed error message
  2187. else
  2188. {
  2189. $errorMessage .= " Unrecognized data format!";
  2190. if (!empty($missingTagsArray)) // some required fields were missing
  2191. {
  2192. if (count($missingTagsArray) == 1) // one field missing
  2193. $errorMessage .= " Required field missing: " . $missingTagsArray[0];
  2194. else // several fields missing
  2195. $errorMessage .= " Required fields missing: " . implode(', ', $missingTagsArray);
  2196. }
  2197. }
  2198. if (!isset($errors["sourceText"]))
  2199. $errors["sourceText"] = $errorMessage;
  2200. else
  2201. $errors["sourceText"] = $errors["sourceText"] . "<br>" . $errorMessage;
  2202. }
  2203. }
  2204. }
  2205. return array($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray);
  2206. }
  2207. // --------------------------------------------------------------------
  2208. // PARSE RECORDS
  2209. // This function processes an array of records containing the source data (as tagged text) and
  2210. // returns an array of records where each record contains an array of extracted field data:
  /**
   * Parses an array of tagged-text source records into arrays of refbase field data.
   *
   * Every record whose (1-based) record number is listed in '$importRecordNumbersRecognizedFormatArray'
   * is pre-processed, split into fields, split into tag/data pairs, mapped onto refbase field names
   * and finally passed thru function 'standardizeFieldData()'. All other records are skipped.
   *
   * @param array  $recordArray                              raw source text of each record
   * @param string $recordFormat                             source format name (only handed on to 'standardizeFieldData()')
   * @param array  $importRecordNumbersRecognizedFormatArray 1-based numbers of the records that shall be imported
   * @param array  $tagsToRefbaseFieldsArray                 maps source tags to refbase field names; a value may also be an
   *                                                         array that maps reference types to refbase field names (with an
   *                                                         optional 'Other' element that is used as fallback)
   * @param array  $tagsMultipleArray                        tags that can occur multiple times within a record (each item is
   *                                                         used as regex pattern when matching tags, and as key of
   *                                                         '$tagsToRefbaseFieldsArray' when merging)
   * @param array  $referenceTypesToRefbaseTypesArray        search & replace patterns that convert format-specific types
   * @param string $fieldDelimiter                           regex pattern (w/o delimiters) that separates fields
   * @param string $dataDelimiter                            regex pattern (w/o delimiters) that separates a tag from its data
   * @param string $personDelimiter                          handed on to 'standardizeFieldData()'
   * @param string $familyNameGivenNameDelimiter             handed on to 'standardizeFieldData()'
   * @param bool   $familyNameFirst                          handed on to 'standardizeFieldData()'
   * @param bool   $shortenGivenNames                        handed on to 'standardizeFieldData()'
   * @param bool   $transformCase                            whether all-uppercase field data shall be converted to title case
   * @param array  $postprocessorActionsArray                handed on to 'standardizeFieldData()'
   * @param array  $preprocessorActionsArray                 list of arrays with a 'match' pattern and search & replace 'actions'
   *                                                         that are applied to the raw record text
   *
   * @return array two-element array: the list of parsed records, and the total number of items in '$recordArray'
   */
  function parseRecords($recordArray, $recordFormat, $importRecordNumbersRecognizedFormatArray, $tagsToRefbaseFieldsArray, $tagsMultipleArray, $referenceTypesToRefbaseTypesArray, $fieldDelimiter, $dataDelimiter, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray, $preprocessorActionsArray)
  {
      global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
      global $showSource;

      $parsedRecordsArray = array(); // will hold the parsed data of all records that shall be imported
      $recordsCount = count($recordArray); // total number of available source records (imported or not)

      // LOOP OVER EACH RECORD:
      for ($i = 0; $i < $recordsCount; $i++)
      {
          // skip records that shall not be imported (i.e. which were either not selected by the user
          // or which contain an unrecognized data format); '$i' starts with 0 so we add 1 to get the record number
          if (!in_array(($i + 1), $importRecordNumbersRecognizedFormatArray))
              continue;

          // PRE-PROCESS FIELD DATA:
          // apply search & replace 'actions' to the record's raw source data:
          foreach ($preprocessorActionsArray as $thisMatchActionsArray)
              if (preg_match($thisMatchActionsArray['match'], $recordArray[$i]))
                  $recordArray[$i] = searchReplaceText($thisMatchActionsArray['actions'], $recordArray[$i], true); // function 'searchReplaceText()' is defined in 'include.inc.php'

          // split the record into its fields:
          $fieldArray = preg_split("/" . $fieldDelimiter . "/", $recordArray[$i]);

          $fieldParametersArray = array(); // will hold all fields that were extracted for the current record
          $tagContentsMultipleArray = array(); // will hold individual items of tags that can occur multiple times

          // LOOP OVER EACH FIELD:
          foreach ($fieldArray as $singleField)
          {
              // split the field into its tag and its field data:
              // NOTE: we split at the FIRST occurrence of the data delimiter only, otherwise field data
              //       which happen to contain the delimiter again would get truncated
              $fieldParts = preg_split("/" . $dataDelimiter . "/", $singleField, 2);

              if (!is_array($fieldParts)) // the split failed (e.g. due to an invalid delimiter pattern)
                  continue;

              $fieldLabel = $fieldParts[0];
              $fieldData = isset($fieldParts[1]) ? $fieldParts[1] : ""; // lines without a data delimiter carry no data

              if (!isset($tagsToRefbaseFieldsArray[$fieldLabel])) // the current tag isn't one we'd like to import
                  continue;

              $fieldMapping = $tagsToRefbaseFieldsArray[$fieldLabel]; // either a refbase field name or an array (reference type => refbase field name)

              $fieldData = preg_replace("/\s{2,}/", " ", $fieldData); // collapse runs of two or more whitespace chars (incl. hard returns) into a single space
              $fieldData = trim($fieldData); // remove any preceding and trailing whitespace from the field data

              // if all of the field data is in uppercase letters, we attempt to convert the string to something more readable:
              // NOTE: while case transformation is also done in function 'standardizeFieldData()', we cannot omit it here
              //       since tags that can occur multiple times must be treated individually (i.e. before merging them)
              if ($transformCase AND ($fieldMapping != "type")) // we exclude reference types from any case transformations
                  // TODO: we should probably only use Unicode-aware expressions here (i.e. something like "/^([$upper$digit]|[^$word])+$/$patternModifiers")
                  if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $fieldData))
                      // convert upper case to title case (converts e.g. "ELSEVIER SCIENCE BV" into "Elsevier Science Bv")
                      $fieldData = changeCase('title', $fieldData); // function 'changeCase()' is defined in 'include.inc.php'

              // reference type of the current record as far as it has been parsed yet:
              // NOTE: type-dependent mappings only work if the tag that maps to 'type' has been parsed already,
              //       which is why '$tagsToRefbaseFieldsArray' should contain this tag as the first element!
              $recordType = isset($fieldParametersArray['type']) ? $fieldParametersArray['type'] : null;

              // extract individual items of tags that can occur multiple times:
              foreach ($tagsMultipleArray as $tagMultiple)
              {
                  if (!preg_match("/^" . $tagMultiple . "$/i", $fieldLabel))
                      continue;

                  if (!is_array($fieldMapping))
                  {
                      $tagContentsMultipleArray[$fieldMapping][] = $fieldData;
                  }
                  else // the tag maps to different refbase fields depending on the record's reference type
                  {
                      $useDefault = true;

                      foreach ($fieldMapping as $referenceType => $refbaseField)
                      {
                          if ($recordType == $referenceType)
                          {
                              $tagContentsMultipleArray[$refbaseField][] = $fieldData;
                              $useDefault = false;
                              break;
                          }
                      }

                      if ($useDefault AND isset($fieldMapping['Other']))
                          $tagContentsMultipleArray[$fieldMapping['Other']][] = $fieldData;
                  }
              }

              // copy field data to array of field parameters (using the corresponding refbase field name as element key):
              if (!is_array($fieldMapping))
              {
                  $fieldParametersArray[$fieldMapping] = $fieldData;
              }
              else // the tag maps to different refbase fields depending on the record's reference type
              {
                  $useDefault = true;

                  foreach ($fieldMapping as $referenceType => $refbaseField)
                  {
                      if ($recordType == $referenceType)
                      {
                          $fieldParametersArray[$refbaseField] = $fieldData;
                          $useDefault = false;
                          break;
                      }
                  }

                  if ($useDefault AND isset($fieldMapping['Other']))
                      $fieldParametersArray[$fieldMapping['Other']] = $fieldData;
              }
          }
          // (END LOOP OVER EACH FIELD)

          // POST-PROCESS FIELD DATA:
          if (empty($showSource) AND isset($fieldParametersArray['source'])) // if we're NOT supposed to display the original source data
              unset($fieldParametersArray['source']); // remove the special 'source' field from the array of fields

          // merge individual items of fields that can occur multiple times:
          foreach ($tagsMultipleArray as $tagMultiple)
          {
              if (!isset($tagsToRefbaseFieldsArray[$tagMultiple])) // no mapping is known under this key, so there's nothing to merge
                  continue;

              $multipleMapping = $tagsToRefbaseFieldsArray[$tagMultiple];

              if (!is_array($multipleMapping))
              {
                  if (isset($tagContentsMultipleArray[$multipleMapping]))
                      $fieldParametersArray[$multipleMapping] = implode("; ", $tagContentsMultipleArray[$multipleMapping]);
              }
              else // the tag maps to different refbase fields depending on the record's reference type
              {
                  $recordType = isset($fieldParametersArray['type']) ? $fieldParametersArray['type'] : null;
                  $useDefault = true;

                  foreach ($multipleMapping as $referenceType => $refbaseField)
                  {
                      if ($recordType == $referenceType)
                      {
                          if (isset($tagContentsMultipleArray[$refbaseField]))
                          {
                              $fieldParametersArray[$refbaseField] = implode("; ", $tagContentsMultipleArray[$refbaseField]);
                              $useDefault = false;
                              break;
                          }
                      }
                  }

                  if ($useDefault AND isset($multipleMapping['Other']))
                      if (isset($tagContentsMultipleArray[$multipleMapping['Other']]))
                          $fieldParametersArray[$multipleMapping['Other']] = implode("; ", $tagContentsMultipleArray[$multipleMapping['Other']]);
              }
          }

          // convert format-specific reference types into refbase format:
          // (e.g. for the RIS format, convert "JOUR" into "Journal Article", etc)
          if (isset($fieldParametersArray['type']))
              $fieldParametersArray['type'] = searchReplaceText($referenceTypesToRefbaseTypesArray, $fieldParametersArray['type'], false); // function 'searchReplaceText()' is defined in 'include.inc.php'

          // standardize field data contained in '$fieldParametersArray':
          $fieldParametersArray = standardizeFieldData($fieldParametersArray, $recordFormat, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray);

          // append the array of extracted field data to the main data array which holds all records to import:
          $parsedRecordsArray[] = $fieldParametersArray;
      }
      // (END LOOP OVER EACH RECORD)

      return array($parsedRecordsArray, $recordsCount);
  }
  2357. // --------------------------------------------------------------------
  2358. // STANDARDIZE FIELD DATA
  2359. // This function standardizes field data contained in '$fieldParametersArray':
  2360. // (e.g. performs case transformation, standardizes thesis names, normalizes page ranges, and reformats person names according to preference)
  /**
   * Standardizes the field data of a single record.
   *
   * In this order, the function: converts all-uppercase values to title case, standardizes theses
   * (type & thesis name), merges 'startPage'/'endPage' into 'pages', expands abbreviated page ranges,
   * reformats person names, copies editors to an empty 'author' field for container/independent items,
   * moves 'volume'/'issue' to the series fields if a series title is given, moves a DOI resolver URL
   * from 'url' to 'doi', reduces 'url' to its first URL, normalizes the ISSN, applies the given
   * postprocessor actions and, finally, derives a PubMed URL from a PMID found in 'notes'.
   *
   * @param array  $fieldParametersArray          field data of one record (refbase field name => value)
   * @param string $recordFormat                  source format name (not evaluated by this function)
   * @param string $personDelimiter               handed on to 'standardizePersonNames()'
   * @param string $familyNameGivenNameDelimiter  handed on to 'standardizePersonNames()'
   * @param bool   $familyNameFirst               handed on to 'standardizePersonNames()'
   * @param bool   $shortenGivenNames             handed on to 'standardizePersonNames()'
   * @param bool   $transformCase                 whether all-uppercase field data shall be converted to title case
   * @param array  $postprocessorActionsArray     list of arrays with a 'fields' element (field names) and search & replace 'actions'
   *
   * @return array the standardized field data (an empty input array is returned unchanged)
   */
  function standardizeFieldData($fieldParametersArray, $recordFormat, $personDelimiter, $familyNameGivenNameDelimiter, $familyNameFirst, $shortenGivenNames, $transformCase, $postprocessorActionsArray)
  {
      global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'

      if (!empty($fieldParametersArray))
      {
          // perform case transformation:
          // NOTE: this case transformation is kinda redundant if the record data were passed thru function 'parseRecords()' before,
          //       but we include it here since this function is also called individually (e.g. by function 'crossrefToRefbase()')
          foreach ($fieldParametersArray as $fieldKey => $fieldData)
          {
              // if all of the field data is in uppercase letters, we attempt to convert the string to something more readable:
              if ($transformCase AND (!preg_match("/^(type|issn|url|doi)$/", $fieldKey))) // we exclude ISSN & DOI numbers, as well as URLs and reference types from any case transformations
                  // TODO: we should probably only use Unicode-aware expressions here (i.e. something like "/^([$upper$digit]|[^$word])+$/$patternModifiers")
                  if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $fieldData))
                      // convert upper case to title case (converts e.g. "ELSEVIER SCIENCE BV" into "Elsevier Science Bv")
                      $fieldParametersArray[$fieldKey] = changeCase('title', $fieldData); // function 'changeCase()' is defined in 'include.inc.php'
          }

          // theses are stored as whole books with an additional thesis name:
          // (records without a 'type' field are left alone here)
          if (isset($fieldParametersArray['type']) AND preg_match("/Thesis/", $fieldParametersArray['type']))
          {
              $fieldParametersArray['type'] = "Book Whole";

              // standardize thesis names:
              if (isset($fieldParametersArray['thesis']))
              {
                  if (preg_match("/^Master'?s?( thesis)?$/i", $fieldParametersArray['thesis']))
                      $fieldParametersArray['thesis'] = "Master's thesis";
                  elseif (preg_match("/^Bachelor'?s?( thesis)?$/i", $fieldParametersArray['thesis']))
                      $fieldParametersArray['thesis'] = "Bachelor's thesis";
                  elseif (preg_match("/^(Diploma( thesis)?|Dipl(om)?(arbeit)?)$/i", $fieldParametersArray['thesis']))
                      $fieldParametersArray['thesis'] = "Diploma thesis";
                  elseif (preg_match("/^(Doctoral( thesis)?|Diss(ertation)?|Doktor(arbeit)?)$/i", $fieldParametersArray['thesis']))
                      $fieldParametersArray['thesis'] = "Doctoral thesis";
                  elseif (preg_match("/^Habil(itation)?( thesis)?$/i", $fieldParametersArray['thesis']))
                      $fieldParametersArray['thesis'] = "Habilitation thesis";
                  else // if an unknown thesis name was given
                      $fieldParametersArray['thesis'] = "Ph.D. thesis"; // NOTE: this fallback may actually be not correct!
              }
              else // if no thesis info was given
                  $fieldParametersArray['thesis'] = "Ph.D. thesis"; // NOTE: this fallback may actually be not correct!
          }

          // merge contents of the special fields 'startPage' and 'endPage' into a range and copy it to the 'pages' field:
          // (these special fields are then removed again from '$fieldParametersArray' since they aren't valid refbase field names)
          if (isset($fieldParametersArray['startPage']) OR isset($fieldParametersArray['endPage']))
          {
              $pages = array();

              if (isset($fieldParametersArray['startPage']))
              {
                  if (!empty($fieldParametersArray['startPage']))
                      $pages[] = $fieldParametersArray['startPage'];

                  unset($fieldParametersArray['startPage']);
              }

              if (isset($fieldParametersArray['endPage']))
              {
                  if (!empty($fieldParametersArray['endPage']))
                      $pages[] = $fieldParametersArray['endPage'];

                  unset($fieldParametersArray['endPage']);
              }

              if (!empty($pages))
                  $fieldParametersArray['pages'] = implode("-", $pages);

              // append "pp" identifier for whole books where the pages field contains a single number:
              // (both fields may be missing, e.g. if the start/end page fields were given but empty)
              if (isset($fieldParametersArray['type']) AND isset($fieldParametersArray['pages'])
                  AND preg_match("/Book Whole/", $fieldParametersArray['type'])
                  AND preg_match("/^\d+$/", $fieldParametersArray['pages']))
                  $fieldParametersArray['pages'] = $fieldParametersArray['pages'] . " pp";
          }

          // if the 'pages' field contains a page range, verify that the end page is actually greater than the start page:
          // TODO: - make regex patterns Unicode-aware (e.g. use '$punct' instead of '-')
          //       - can this be standardized with function 'formatPageInfo()' in 'cite.inc.php'?
          if (isset($fieldParametersArray['pages']) AND preg_match("/^\d+\D*-\D*\d+$/", $fieldParametersArray['pages']))
          {
              list($startPage, $endPage) = preg_split("/\D*-\D*/", $fieldParametersArray['pages']);

              $countStartPage = strlen($startPage);
              $countEndPage = strlen($endPage);

              if (($countStartPage > $countEndPage) AND ($startPage > $endPage))
              {
                  // the last digits of the start page (as many as the end page has):
                  $startPagePart = preg_replace("/^.*?(\d{" . $countEndPage . "})$/", "\\1", $startPage);

                  if ($startPagePart < $endPage)
                      $fieldParametersArray['pages'] = $startPage . "-" . ($startPage + ($endPage - $startPagePart)); // convert page ranges such as '673-6' or '673-85' to '673-676' or '673-685', respectively
              }
          }

          // standardize contents of the 'author', 'editor' and 'series_editor' fields:
          foreach (array('author', 'editor', 'series_editor') as $nameKey)
              if (!empty($fieldParametersArray[$nameKey]))
                  $fieldParametersArray[$nameKey] = standardizePersonNames($fieldParametersArray[$nameKey], $familyNameFirst, $personDelimiter, $familyNameGivenNameDelimiter, $shortenGivenNames);

          // if the 'author' field is empty BUT the 'editor' field is not empty AND the record type is either a container item
          // or a self-contained/independent item (such as 'Book Whole', 'Journal', 'Manuscript' or 'Map'):
          if (empty($fieldParametersArray['author']) AND !empty($fieldParametersArray['editor']) AND isset($fieldParametersArray['type'])
              AND preg_match("/^(Book Whole|Conference Volume|Journal|Manual|Manuscript|Map|Miscellaneous|Patent|Report|Software)$/", $fieldParametersArray['type']))
          {
              $fieldParametersArray['author'] = $fieldParametersArray['editor']; // duplicate field contents from 'editor' to 'author' field

              if (!preg_match("/;/", $fieldParametersArray['author'])) // no ';' (which would delimit multiple authors) => single author
                  $fieldParametersArray['author'] .= " (ed)";
              else // at least one ';' => multiple authors
                  $fieldParametersArray['author'] .= " (eds)";
          }

          // if some (full or abbreviated) series title was given, we assume that the information given in 'volume'/'issue'
          // is actually the 'series_volume'/'series_issue':
          if (!empty($fieldParametersArray['series_title']) OR !empty($fieldParametersArray['abbrev_series_title']))
          {
              if (!empty($fieldParametersArray['volume']) AND empty($fieldParametersArray['series_volume'])) // move 'volume' to 'series_volume'
              {
                  $fieldParametersArray['series_volume'] = $fieldParametersArray['volume'];
                  unset($fieldParametersArray['volume']);
              }

              if (!empty($fieldParametersArray['issue']) AND empty($fieldParametersArray['series_issue'])) // move 'issue' to 'series_issue'
              {
                  $fieldParametersArray['series_issue'] = $fieldParametersArray['issue'];
                  unset($fieldParametersArray['issue']);
              }
          }

          // if the 'url' field actually contains a DOI resolver URL (AND the 'doi' field is empty), we'll extract the DOI
          // and move it to the 'doi' field:
          // - accepted resolver prefixes: "http://dx.doi.org/", "https://dx.doi.org/", "http://doi.org/" and "https://doi.org/"
          // - DOI registrant codes may have more than four digits (e.g. "10.12345/..."), so we accept 4 to 9 digits
          $doiResolverPattern = 'https?://(?:dx\.)?doi\.org/';
          $doiPattern = '10\.\d{4,9}/\S+?';
          $doiEndPattern = '(?=$|; )'; // a DOI URL ends at the end of the string or at the delimiter that precedes the next URL

          if (!empty($fieldParametersArray['url']) AND empty($fieldParametersArray['doi'])
              AND preg_match('#(?<=^|; )' . $doiResolverPattern . $doiPattern . $doiEndPattern . '#', $fieldParametersArray['url']))
          {
              $fieldParametersArray['doi'] = preg_replace('#(?:.+?; )?' . $doiResolverPattern . '(' . $doiPattern . ')' . $doiEndPattern . '.*#', "\\1", $fieldParametersArray['url']); // extract DOI to 'doi' field
              $fieldParametersArray['url'] = preg_replace('#^' . $doiResolverPattern . $doiPattern . $doiEndPattern . '(; )?#', "", $fieldParametersArray['url']); // remove DOI URL from beginning of 'url' field
              $fieldParametersArray['url'] = preg_replace('#(; )?' . $doiResolverPattern . $doiPattern . $doiEndPattern . '#', "", $fieldParametersArray['url']); // remove DOI URL from middle (or end) of 'url' field

              if (empty($fieldParametersArray['url'])) // the DOI URL was the only URL given
                  unset($fieldParametersArray['url']);
          }

          if (!empty($fieldParametersArray['url'])) // besides any DOI URL, some other URL(s) were given
              $fieldParametersArray['url'] = preg_replace("/^([^ ]+?)(?=$|; ).*/", "\\1", $fieldParametersArray['url']); // remove everything but the first URL from the 'url' field (currently, refbase does only support one URL per record)

          // standardize format of ISSN number:
          if (!empty($fieldParametersArray['issn']) AND preg_match("/^ *\d{4}\D*\d{4} *$/", $fieldParametersArray['issn']))
          {
              $fieldParametersArray['issn'] = preg_replace("/^ *(\d{4})\D*(\d{4}) *$/", "\\1-\\2", $fieldParametersArray['issn']);
          }

          // apply search & replace 'actions' to all fields that are listed in the 'fields' element of the arrays
          // contained in '$postprocessorActionsArray':
          foreach ($postprocessorActionsArray as $fieldActionsArray)
              foreach ($fieldParametersArray as $fieldName => $fieldValue)
                  if (in_array($fieldName, $fieldActionsArray['fields']))
                      $fieldParametersArray[$fieldName] = searchReplaceText($fieldActionsArray['actions'], $fieldValue, true); // function 'searchReplaceText()' is defined in 'include.inc.php'

          // if (except for a DOI) no other URL(s) are given AND the 'notes' field contains a PubMed ID, we extract the
          // PubMed ID and copy a resolvable URL (that points to the PubMed article's abstract page) to the 'url' field:
          if (!isset($fieldParametersArray['url']) AND isset($fieldParametersArray['notes']) AND preg_match("/PMID *: *\d+/i", $fieldParametersArray['notes']))
              $fieldParametersArray['url'] = "http://www.ncbi.nlm.nih.gov/pubmed/" . preg_replace("/.*?PMID *: *(\d+).*/i", "\\1", $fieldParametersArray['notes']);
      }

      return $fieldParametersArray;
  }
  2504. // --------------------------------------------------------------------
  2505. // STANDARDIZE PERSON NAMES
  2506. // This function is currently a wrapper for the 'reArrangeAuthorContents()' function that is used by several import routines.
  2507. // The function standardizes the contents of the 'author', 'editor' and 'series_editor' fields and features removal of
  2508. // extra whitespace, re-arranging of family and given names, abbreviation of given names, adding of dots between initials, etc.
  /**
   * Re-orders the contents of an 'author', 'editor' or 'series_editor' field by delegating to
   * function 'reArrangeAuthorContents()' (which is defined in 'include.inc.php').
   *
   * The caller describes the INPUT format (order of family & given name, delimiter patterns);
   * the OUTPUT format is fixed by this wrapper: persons are delimited by "; ", family name and
   * initials by ", ", multiple initials by ".", and initials go after the family name.
   *
   * @param string $nameString                    contents of the person field
   * @param bool   $familyNameFirst               'true' if the family name precedes the given name (or initials)
   *                                              in the source string, 'false' if it's the other way around
   * @param string $personDelimiter               pattern describing the delimiter that separates different persons
   * @param string $familyNameGivenNameDelimiter  pattern describing the delimiter that separates family name & given
   *                                              name/initials (within one person)
   * @param bool   $shortenGivenNames             whether full given name(s) shall be shortened to initial(s)
   *
   * @return mixed whatever 'reArrangeAuthorContents()' returns for the re-ordered name string
   */
  function standardizePersonNames($nameString, $familyNameFirst, $personDelimiter, $familyNameGivenNameDelimiter, $shortenGivenNames)
  {
      // output delimiters used for the re-ordered string:
      $newPersonDelimiter = "; "; // separates different persons (used for all persons, incl. the last one)
      $newNameInitialsDelimiter = ", "; // separates family name & initials (used for the first and all other persons)
      $newInitialsDelimiter = "."; // separates multiple initials (within one person)

      $initialsBeforeName = false; // initials go *after* the family name (used for the first and all other persons)

      // the numbers refer to the position of each parameter in the call to 'reArrangeAuthorContents()':
      return reArrangeAuthorContents(
          $nameString,                   //  1. input:  contents of the person field
          $familyNameFirst,              //  2. input:  does the family name come first in the source string?
          $personDelimiter,              //  3. input:  old delimiter pattern between persons
          $newPersonDelimiter,           //  4. output: new delimiter between persons (all but the last person)
          $newPersonDelimiter,           //  5. output: new delimiter before the last person
          $familyNameGivenNameDelimiter, //  6. input:  old delimiter pattern between name & initials
          $newNameInitialsDelimiter,     //  7. output: new delimiter between name & initials (first person)
          $newNameInitialsDelimiter,     //  8. output: new delimiter between name & initials (all other persons)
          $newInitialsDelimiter,         //  9. output: new delimiter between multiple initials
          $initialsBeforeName,           // 10. output: initials before the name? (first person)
          $initialsBeforeName,           // 11. output: initials before the name? (all other persons)
          $shortenGivenNames,            // 12. output: shorten full given names to initials?
          "",                            // 13. output: max. number of persons before truncation (empty = return all persons)
          "",                            // 14. output: number of persons to keep when truncating (empty = not applicable)
          "",                            // 15. output: string appended when truncating (empty = not applicable)
          false                          // 16. output: return higher ASCII chars HTML encoded?
      );
  }
  2551. // --------------------------------------------------------------------
  2552. // BUILD IMPORT ARRAY
  2553. // This function builds an array structure that can be passed to the 'addRecords()' function for import:
  2554. // (for a more detailed explanation of the required array structure, see the comments above the
  2555. // 'addRecords()' function in 'include.inc.php')
  /**
   * Wraps the given metadata and parsed records into the array structure that is passed to
   * function 'addRecords()' for import (see the comments above 'addRecords()' in 'include.inc.php'
   * for a more detailed explanation of the required structure).
   *
   * Notes from the original author on what 'addRecords()' does with this structure:
   * - calculation fields ('first_author', 'author_count', 'first_page', 'volume_numeric' and
   *   'series_volume_numeric') are taken care of by 'addRecords()'
   * - the *date/*time/*by fields ('created_date', 'created_time', 'created_by', 'modified_date',
   *   'modified_time' and 'modified_by') are filled automatically if no custom values (in correct
   *   date ['YYYY-MM-DD'] and time ['HH:MM:SS'] format) are given
   * - custom info for the 'location' field could be passed along; omitting it causes 'addRecords()'
   *   to insert name & email address of the currently logged-in user
   * - if the 'prefix_call_number' element of '$options' is set to "true", any 'call_number' string
   *   will be prefixed with the call number prefix of the currently logged-in user
   * - serial numbers are assigned automatically and returned by 'addRecords()' as an array
   *
   * @param mixed $type                the array format of the 'records' element
   * @param mixed $version             the version of the given array structure
   * @param mixed $creator             the name of the script/importer (preferably given as unique URI)
   * @param mixed $author              author/contact name of the person who's responsible for this script/importer
   * @param mixed $contact             author's email/contact address
   * @param array $options             settings that control the behaviour of function 'addRecords()'
   * @param array $parsedRecordsArray  array of record(s) (with each record being a sub-array of fields)
   *
   * @return array the import structure, with elements 'type', 'version', 'creator', 'author',
   *               'contact', 'options' and 'records' (in this order)
   */
  function buildImportArray($type, $version, $creator, $author, $contact, $options, $parsedRecordsArray)
  {
      return array(
          'type'    => $type,
          'version' => $version,
          'creator' => $creator,
          'author'  => $author,
          'contact' => $contact,
          'options' => $options,
          'records' => $parsedRecordsArray
      );
  }
  2584. // --------------------------------------------------------------------
  2585. // This function takes a BibTeX source and converts any contained
  2586. // LaTeX/BibTeX markup into proper refbase markup:
  /**
   * Converts LaTeX/BibTeX markup contained in a BibTeX source text into refbase markup.
   *
   * Two passes of function 'searchReplaceText()' (defined in 'include.inc.php') are applied:
   * 1. the patterns from '$transtab_bibtex_refbase' (defined in 'transtab_bibtex_refbase.inc.php'),
   *    which convert LaTeX/BibTeX markup & entities (such as fontshape markup, super- and subscript,
   *    or greek letters in math mode) into the respective refbase markup
   * 2. a best-effort translation of LaTeX markup for higher ASCII chars, using '$transtab_latex_unicode'
   *    if '$contentTypeCharset' (defined in 'ini.inc.php') is "UTF-8", and '$transtab_latex_latin1' otherwise
   *
   * @param string $bibtexSourceText  the BibTeX source text
   *
   * @return mixed the converted text as returned by 'searchReplaceText()'
   */
  function standardizeBibtexInput($bibtexSourceText)
  {
      global $contentTypeCharset, $transtab_bibtex_refbase, $transtab_latex_latin1, $transtab_latex_unicode;

      // first pass: LaTeX/BibTeX markup -> refbase markup
      $markupConvertedText = searchReplaceText($transtab_bibtex_refbase, $bibtexSourceText, true);

      // second pass: pick the character translation table that matches the charset of this installation
      $latexCharacterTable = ($contentTypeCharset == "UTF-8") ? $transtab_latex_unicode : $transtab_latex_latin1;

      return searchReplaceText($latexCharacterTable, $markupConvertedText, false);
  }
  2607. // --------------------------------------------------------------------
  2608. // This function takes an Endnote XML source and converts any contained
  2609. // text style markup into proper refbase markup:
  /**
   * Converts text style markup contained in an Endnote XML source text into refbase markup.
   *
   * The conversion applies the search & replace patterns from '$transtab_endnotexml_refbase'
   * (defined in 'transtab_endnotexml_refbase.inc.php'), which attempt to convert fontshape markup
   * (italic, bold) as well as super- and subscript, via function 'searchReplaceText()'
   * (defined in 'include.inc.php').
   *
   * @param string $endxSourceText  the Endnote XML source text
   *
   * @return mixed the converted text as returned by 'searchReplaceText()'
   */
  function standardizeEndnoteXMLInput($endxSourceText)
  {
      global $transtab_endnotexml_refbase;

      return searchReplaceText($transtab_endnotexml_refbase, $endxSourceText, true);
  }
  2619. // --------------------------------------------------------------------
  2620. // This function fetches source data from PubMed.gov for all PubMed IDs
  2621. // given in '$pmidArray':
  2622. // ('$sourceFormat' must be either "Pubmed Medline" or "Pubmed XML";
  2623. // more info on the Entrez Programming Utilities:
  2624. // <http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html>)
  /**
   * Fetches source data from PubMed.gov for all PubMed IDs given in '$pmidArray'.
   *
   * Duplicate IDs are removed, then one request is sent per ID (via function 'fetchDataFromURL()',
   * which is defined in 'include.inc.php') and the responses are concatenated.
   *
   * @param array  $pmidArray     PubMed IDs of the records to fetch
   * @param string $sourceFormat  "Pubmed XML" (matched case-insensitively) requests XML; any other value
   *                              requests the default "Pubmed Medline" text format
   *
   * @return array two-element array: the global '$errors' variable (not modified by this function),
   *               and the concatenated source text (an empty string if no IDs were given)
   */
  function fetchDataFromPubMed($pmidArray, $sourceFormat = "Pubmed Medline")
  {
      global $errors;

      $sourceText = "";

      if (!empty($pmidArray))
      {
          // Remove any duplicate PubMed IDs:
          $pmidArray = array_unique($pmidArray);

          // Define response format:
          if (preg_match("/^Pubmed XML$/i", $sourceFormat))
              $fetchType = "xml";
          else // by default, we'll use the "Pubmed Medline" format
              $fetchType = "text";

          // NOTE:
          // When querying PubMed for multiple PubMed IDs *at once*, errors are not
          // returned inline on a per-record basis. If one or more of the given
          // PubMed IDs are invalid, PubMed returns a single error message, if the
          // first given PubMed ID is invalid, otherwise it returns records until
          // the first invalid ID is encountered. In any case, the remaining records
          // seem to get omitted from the PubMed response.
          // To work around this, we'll query PubMed for each given PubMed ID
          // *individually* (similar to function 'fetchDataFromCrossRef()'), and we
          // then perform the record validation (i.e. error checking) in function
          // 'validateRecords()'.
          // Fetching all records via a single HTTP request (by passing the IDs merged
          // with commas as 'id' parameter) may be more efficient, but it would prevent
          // us from checking errors on a per-record level.

          // The part of the query URL that's identical for all requests:
          $baseURL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
                   . "?db=pubmed"
                   . "&retmode=" . $fetchType
                   . "&rettype=medline"
                   . "&tool=refbase"
                   . "&email=" . rawurlencode("info@refbase.net");

          foreach ($pmidArray as $pmid)
          {
              // Build query URL:
              // (the ID is URL-encoded so that unexpected characters cannot break or extend the query string)
              $sourceURL = $baseURL . "&id=" . rawurlencode((string) $pmid);

              // Perform query:
              $sourceText .= fetchDataFromURL($sourceURL); // function 'fetchDataFromURL()' is defined in 'include.inc.php'
          }
      }

      return array($errors, $sourceText);
  }
  2688. // --------------------------------------------------------------------
  2689. // This function fetches record metadata from arXiv.org for all arXiv IDs
  2690. // given in '$itemArray':
  2691. // (for '$sourceFormat', only "arXiv XML", i.e. the arXiv.org Atom XML OpenSearch format,
  2692. // is currently supported; more info on the arXiv API:
  2693. // <http://export.arxiv.org/api_help/>
  2694. // <http://export.arxiv.org/api_help/docs/user-manual.html>
  2695. //
  2696. // Requires the SimplePie library (by Ryan Parman and Geoffrey Sneddon), which is
  2697. // available under the BSD license from: <http://simplepie.org>
  2698. function fetchDataFromArXiv($itemArray, $sourceFormat = "arXiv XML")
  2699. {
  2700. global $errors; // NOTE: ATM, error checking is done in function 'arxivToRefbase()'
  2701. $sourceURLArray = array();
  2702. if (!empty($itemArray))
  2703. {
  2704. // Remove any duplicate IDs:
  2705. $itemArray = array_unique($itemArray);
  2706. // NOTE:
  2707. // When querying arXiv.org for multiple arXiv IDs *at once*, errors are not
  2708. // returned inline on a per-record basis. If one or more of the given
  2709. // arXiv IDs are invalid, arXiv.org returns a *single* error message, and
  2710. // any other requested records seem to get omitted from the arXiv response.
  2711. // To work around this, we'll query arXiv.org for each given arXiv ID
  2712. // *individually*, and we then perform the record validation (i.e. error
  2713. // checking) in function 'arxivToRefbase()'.
  2714. foreach ($itemArray as $item)
  2715. {
  2716. // if (preg_match("#(arXiv:|http://arxiv\.org/abs/)?([\w.-]+/\d{7}|\d{4}\.\d{4,})(v\d+)?#i", $item)) // '$item' is an arXiv ID
  2717. // {
  2718. // Build query URL:
  2719. $sourceURLArray[] = "http://export.arxiv.org/api/query"
  2720. . "?id_list=" . rawurlencode($item);
  2721. // }
  2722. }
  2723. // Perform query:
  2724. $feed = new SimplePie(); // setup new SimplePie constructor
  2725. $feed->set_feed_url($sourceURLArray); // setup multi-feed request
  2726. $feed->set_input_encoding('UTF-8'); // force UTF-8 as input encoding
  2727. $feed->enable_cache(false); // disable caching
  2728. $feed->enable_order_by_date(false); // disable automatic sorting of entries by date
  2729. $feed->init(); // process options, fetch feeds, cache, parse, merge, etc
  2730. }
  2731. return array($errors, $feed);
  2732. }
  2733. // --------------------------------------------------------------------
  2734. // This function tries to fetch PubMed IDs from PubMed.gov for all DOIs given
  2735. // in '$doiArray':
  2736. //
  2737. // NOTE: The function 'SimpleXMLElement()' requires the SimpleXML extension which,
  2738. // in turn, requires PHP 5 compiled with the --enable-libxml option.
  2739. //
  2740. // Author: Nicholaus Lance Hepler <mailto:nhelper@gmail.com>
  2741. function fetchDOIsFromPubMed($doiArray, $sourceFormat = "CrossRef XML")
  2742. {
  2743. global $errors;
  2744. $sourceText = "";
  2745. $pmidArray = array();
  2746. $failedIDs = array();
  2747. if (!empty($doiArray))
  2748. {
  2749. // Remove any duplicate IDs:
  2750. $doiArray = array_unique($doiArray);
  2751. foreach ($doiArray as $doi)
  2752. {
  2753. // Build query URL:
  2754. $sourceURL = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
  2755. . "?db=pubmed"
  2756. . "&retmax=1"
  2757. . "&field=doi"
  2758. . "&term=" . $doi;
  2759. // Perform query:
  2760. $esearchText = fetchDataFromURL($sourceURL);
  2761. $xml = new SimpleXMLElement($esearchText); // requires PHP 5 with --enable-libxml
  2762. if ($xml->Count != 1 || (isset($xml->ErrorList->PhraseNotFound) && !empty($xml->ErrorList->PhraseNotFound)))
  2763. {
  2764. $failedIDs[] = $doi;
  2765. }
  2766. else
  2767. {
  2768. // Extract PubMed ID:
  2769. $pmidArray[] = $xml->IdList->Id[0];
  2770. }
  2771. }
  2772. }
  2773. if (!empty($failedIDs))
  2774. {
  2775. $failedIDs = array_merge($failedIDs, $pmidArray);
  2776. }
  2777. else
  2778. {
  2779. // Fetch source data from PubMed.gov for all found PubMed IDs:
  2780. list($errors, $sourceText) = fetchDataFromPubMed($pmidArray);
  2781. }
  2782. return array($errors, $sourceText, $failedIDs);
  2783. }
  2784. // --------------------------------------------------------------------
  2785. // This function tries to fetch record metadata from CrossRef.org for all DOIs or
  2786. // OpenURLs given in '$itemArray':
  2787. // (for '$sourceFormat', only "CrossRef XML", i.e. the CrossRef "unixref XML" format,
  2788. // is currently supported; more info on the CrossRef OpenURL resolver/metadata server:
  2789. // <http://www.crossref.org/openurl>
  2790. // see also: <http://hublog.hubmed.org/archives/001624.html>)
  2791. function fetchDataFromCrossRef($itemArray, $sourceFormat = "CrossRef XML")
  2792. {
  2793. global $errors;
  2794. global $crossRefReqDat;
  2795. $sourceText = "";
  2796. if (!empty($itemArray))
  2797. {
  2798. // Remove any duplicate IDs:
  2799. $itemArray = array_unique($itemArray);
  2800. // Define response format:
  2801. // if (preg_match("/^CrossRef XML$/i", $sourceFormat))
  2802. // $fetchType = "unixref";
  2803. // else // by default, we'll use the "unixref XML" format
  2804. $fetchType = "unixref";
  2805. foreach ($itemArray as $item)
  2806. {
  2807. // Build query URL:
  2808. $sourceURL = "http://www.crossref.org/openurl/"
  2809. . "?noredirect=true"
  2810. . "&format=" . $fetchType;
  2811. if (!empty($crossRefReqDat))
  2812. $sourceURL .= "&pid=" . rawurlencode($crossRefReqDat);
  2813. if (preg_match("#^10\.\d{4}/\S+$#", $item)) // '$item' is a DOI
  2814. $sourceURL .= "&id=" . rawurlencode("doi:" . $item);
  2815. else // otherwise we assume a full OpenURL context object // TODO: verify OpenURL!?
  2816. $sourceURL .= "&" . $item;
  2817. // Perform query:
  2818. $sourceText .= fetchDataFromURL($sourceURL); // function 'fetchDataFromURL()' is defined in 'include.inc.php'
  2819. }
  2820. }
  2821. return array($errors, $sourceText);
  2822. }
  2823. // --------------------------------------------------------------------
  2824. // CSA TO REFBASE
  2825. // This function converts records from Cambridge Scientific Abstracts (CSA) into the standard "refbase"
  2826. // array format which can be then imported by the 'addRecords()' function in 'include.inc.php'.
  2827. function csaToRefbase($sourceText, $importRecordsRadio, $importRecordNumbersArray)
  2828. {
  2829. global $alnum, $alpha, $cntrl, $dash, $digit, $graph, $lower, $print, $punct, $space, $upper, $word, $patternModifiers; // defined in 'transtab_unicode_charset.inc.php' and 'transtab_latin1_charset.inc.php'
  2830. global $errors;
  2831. global $showSource;
  2832. // Defines the pattern by which the input text will be split into individual records:
  2833. $recordDelimiter = "\s*Record \d+ of \d+\s*";
  2834. // PRE-PROCESS SOURCE TEXT:
  2835. // Split input text into individual records:
  2836. $recordArray = splitSourceText($sourceText, $recordDelimiter, false); // split input text on the header text preceeding each CSA record (e.g. "\nRecord 4 of 52\n")
  2837. // Count how many records are available:
  2838. $recordsCount = count($recordArray);
  2839. // ----------------------------------------------------------------
  2840. // VALIDATE INDIVIDUAL RECORDS:
  2841. // Note that source data must begin with "\nRecord x of xx\n" and that (opposed to the handling in 'import_csa_modify.php') any text preceeding the source data isn't removed but treated as the first record!
  2842. // This array lists patterns which match all CSA tags that must occur within a record to be recognized as valid CSA record:
  2843. // (Array keys must contain the tag name as it should be displayed to the user; as is the case with search & replace actions,
  2844. // the search patterns MUST include the leading & trailing slashes.)
  2845. // "tag display name" => "tag search pattern"
  2846. $requiredTagsArray = array(
  2847. "title" => "/^TI: Title *[\r\n]+ {4,4}/m",
  2848. "author (or editor)" => "/^(AU: Author|ED: Editor) *[\r\n]+ {4,4}/m",
  2849. "source" => "/^SO: Source *[\r\n]+ {4,4}/m" // since the "SO: Source" is also specified as format requirement in function 'identifySourceFormat()' records without "SO: Source" won't be recognized anyhow
  2850. );
  2851. // Validate all records that shall be imported:
  2852. list($errors, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray) = validateRecords($recordArray, $requiredTagsArray, $importRecordsRadio, $importRecordNumbersArray, $errors);
  2853. // ----------------------------------------------------------------
  2854. // PROCESS SOURCE DATA:
  2855. $parsedRecordsArray = array(); // initialize array variable which will hold parsed data of all records that shall be imported
  2856. // LOOP OVER EACH RECORD:
  2857. for ($i=0; $i<$recordsCount; $i++) // for each record...
  2858. {
  2859. // if we're NOT supposed to import this record (because it was either not selected by the user -OR- because it did contain an unrecognized data format)
  2860. if (!in_array(($i+1), $importRecordNumbersRecognizedFormatArray)) // '$i' starts with 0 so we have to add 1 to point to the correct record number
  2861. {
  2862. continue; // process next record (if any)
  2863. }
  2864. else // ...import the current record:
  2865. {
  2866. $singleRecord = $recordArray[$i];
  2867. // if the "AU: Author" field is missing BUT the "ED: Editor" is present (which is allowed for book monographs):
  2868. // we replace the "ED: Editor" field identifier with "AU: Author" (this will keep any " (ed)" and " (eds)" tags in place which, in turn, will cause the "is Editor" checkbox in 'record.php' to get marked)
  2869. if (!preg_match("/^AU: Author *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/^ED: Editor *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m", $singleRecord))
  2870. $singleRecord = preg_replace("/^ED: Editor(?= *[\r\n]+ {4,4})/m", "AU: Author", $singleRecord);
  2871. // split each record into its fields:
  2872. $fieldArray = preg_split("/[\r\n]+(?=\w\w: )/", $singleRecord);
  2873. // initialize some variables:
  2874. $fieldParametersArray = array(); // setup an empty array (it will hold all fields that were extracted for a given record)
  2875. $additionalDocumentTypeInfo = ""; // will be used with the "PT: Publication Type" field
  2876. $environmentalRegime = ""; // will be used with the "ER: Environmental Regime" field
  2877. // GENERATE EXTRA FIELDS:
  2878. // check if the fields "MT: Monograph Title", "JN: Journal Name", "JV: Journal Volume", "JI: Journal Issue" and "JP: Journal Pages" are present,
  2879. // if not, we attempt to generate them from the "SO: Source" field:
  2880. $sourceField = preg_replace("/.*SO: Source *[\r\n]+ {4,4}(.+?)(?=([\r\n]+\w\w: |\s*\z)).*/ms", "\\1", $singleRecord); // first, we need to extract the "SO: Source" field data from the record text
  2881. $sourceField = preg_replace("/\s{2,}/", " ", $sourceField); // remove any hard returns and extra spaces within the source field data string
  2882. // if the current record is of type "Book Monograph" but the field "MT: Monograph Title" is missing:
  2883. if (preg_match("/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m", $singleRecord) AND !preg_match("/^MT: Monograph Title *[\r\n]+ {4,4}/m", $singleRecord))
  2884. {
  2885. $extractedSourceFieldData = preg_replace("/^([^.[]+).*/", "\\1", $sourceField); // attempt to extract the full monograph title from the source field
  2886. if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $extractedSourceFieldData)) // if all of the words within the monograph title are uppercase, we attempt to convert the string to something more readable:
  2887. // perform case transformation (e.g. convert "BIOLOGY AND ECOLOGY OF GLACIAL RELICT CRUSTACEA" into "Biology And Ecology Of Glacial Relict Crustacea")
  2888. $extractedSourceFieldData = changeCase('title', $extractedSourceFieldData); // function 'changeCase()' is defined in 'include.inc.php'
  2889. $fieldArray[] = "MT: Monograph Title\r\n " . $extractedSourceFieldData; // add field "MT: Monograph Title" to the array of fields
  2890. }
  2891. // else if the current record is of type "Journal Article", "Report", etc (or wasn't specified) but the field "JN: Journal Name" is missing:
  2892. elseif (!preg_match("/^JN: Journal Name *[\r\n]+ {4,4}/m", $singleRecord)) // preg_match("/^(PT: Publication Type\s+(Journal Article|Report)|DT: Document Type\s+(J|R))/m", $singleRecord)
  2893. {
  2894. if (preg_match("/\[/", $sourceField)) // if the source field data contain a square bracket we assume a format like: "Journal of Phycology [J. Phycol.]. Vol. 37, no. s3, pp. 18-18. Jun 2001."
  2895. $extractedSourceFieldData = preg_replace("/^([^.[]+).*/", "\\1", $sourceField); // attempt to extract the full journal name from the source field
  2896. else // source field format might be something like: "Phycologia, vol. 34, no. 2, pp. 135-144, 1995"
  2897. $extractedSourceFieldData = preg_replace("/^([^.,]+).*/", "\\1", $sourceField); // attempt to extract the full journal name from the source field
  2898. if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $extractedSourceFieldData)) // if all of the words within the journal name are uppercase, we attempt to convert the string to something more readable:
  2899. // perform case transformation (e.g. convert "POLAR BIOLOGY" into "Polar Biology")
  2900. $extractedSourceFieldData = changeCase('title', $extractedSourceFieldData);
  2901. $fieldArray[] = "JN: Journal Name\r\n " . $extractedSourceFieldData; // add field "JN: Journal Name" to the array of fields
  2902. }
  2903. // if the "JV: Journal Volume" is missing BUT the "SO: Source" field contains a volume specification:
  2904. if (!preg_match("/^JV: Journal Volume *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/(?<=\W)vol[. ]+[\w\/-]+/i", $sourceField))
  2905. {
  2906. $extractedSourceFieldData = preg_replace("/.*(?<=\W)vol[. ]+([\w\/-]+).*/i", "\\1", $sourceField); // attempt to extract the journal volume from the source field
  2907. $fieldArray[] = "JV: Journal Volume\r\n " . $extractedSourceFieldData; // add field "JV: Journal Volume" to the array of fields
  2908. }
  2909. // if the "JI: Journal Issue" is missing BUT the "SO: Source" field contains an issue specification:
  2910. if (!preg_match("/^JI: Journal Issue *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/(?<=\W)no[. ]+[\w\/-]+/i", $sourceField))
  2911. {
  2912. $extractedSourceFieldData = preg_replace("/.*(?<=\W)no[. ]+([\w\/-]+).*/i", "\\1", $sourceField); // attempt to extract the journal issue from the source field
  2913. $fieldArray[] = "JI: Journal Issue\r\n " . $extractedSourceFieldData; // add field "JI: Journal Issue" to the array of fields
  2914. }
  2915. // if the "JP: Journal Pages" is missing BUT the "SO: Source" field contains a pages specification:
  2916. if (!preg_match("/^JP: Journal Pages *[\r\n]+ {4,4}/m", $singleRecord) AND preg_match("/((?<=\W)pp?[. ]+[\w\/,-]+|[\d,]+ *pp\b)/i", $sourceField))
  2917. {
  2918. if (preg_match("/(?<=\W)pp?[. ]+[\w\/,-]+/i", $sourceField)) // e.g. "pp. 212-217" or "p. 216" etc
  2919. $extractedSourceFieldData = preg_replace("/.*(?<=\W)pp?[. ]+([\w\/,-]+).*/i", "\\1", $sourceField); // attempt to extract the journal pages from the source field
  2920. elseif (preg_match("/[\d,]+ *pp\b/", $sourceField)) // e.g. "452 pp"
  2921. $extractedSourceFieldData = preg_replace("/.*?([\d,]+ *pp)\b.*/i", "\\1", $sourceField); // attempt to extract the journal pages from the source field
  2922. $extractedSourceFieldData = preg_replace("/,/", "", $extractedSourceFieldData); // remove any thousands separators from journal pages
  2923. $fieldArray[] = "JP: Journal Pages\r\n " . $extractedSourceFieldData; // add field "JP: Journal Pages" to the array of fields
  2924. }
  2925. // Additionally, we extract the abbreviated journal name from the "SO: Source" field (if available):
  2926. if (preg_match("/\[/", $sourceField)) // if the source field data contain a square bracket we assume a format like: "Journal of Phycology [J. Phycol.]. Vol. 37, no. s3, pp. 18-18. Jun 2001."
  2927. {
  2928. $extractedSourceFieldData = preg_replace("/.*\[(.+?)\].*/", "\\1", $sourceField); // attempt to extract the abbreviated journal name from the source field
  2929. $extractedSourceFieldData = preg_replace("/\./", "", $extractedSourceFieldData); // remove any dots from the abbreviated journal name
  2930. if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $extractedSourceFieldData)) // if all of the words within the abbreviated journal name are uppercase, we attempt to convert the string to something more readable:
  2931. // perform case transformation (e.g. convert "BALT SEA ENVIRON PROC" into "Balt Sea Environ Proc")
  2932. $extractedSourceFieldData = changeCase('title', $extractedSourceFieldData);
  2933. $fieldArray[] = "JA: Abbrev Journal Name\r\n " . $extractedSourceFieldData; // add field "JA: Abbrev Journal Name" to the array of fields (note that this field normally does NOT occur within the CSA full record format!)
  2934. }
  2935. // (END GENERATE EXTRA FIELDS)
  2936. // LOOP OVER EACH FIELD:
  2937. foreach ($fieldArray as $singleField) // for each field within the current record...
  2938. {
  2939. $singleField = preg_replace("/^(\w\w: [^\r\n]+)[\r\n]+ {4,4}/", "\\1___LabelDataSplitter___", $singleField); // insert a unique text string between the field identifier and the field data
  2940. $fieldLabelPlusDataArray = preg_split("/___LabelDataSplitter___/", $singleField); // split each field into a 2-element array containing [0] the field identifier and [1] the field data
  2941. $fieldLabel = $fieldLabelPlusDataArray[0];
  2942. $fieldData = $fieldLabelPlusDataArray[1];
  2943. $fieldData = preg_replace("/\s{2,}/", " ", $fieldData); // remove any hard returns and extra spaces within the data string
  2944. $fieldData = trim($fieldData); // remove any preceeding and trailing whitespace from the field data
  2945. if (preg_match("/AU: Author/", $fieldLabel))
  2946. {
  2947. $fieldData = preg_replace("/\*/", "", $fieldData); // remove any asterisk ("*")
  2948. $fieldData = standardizePersonNames($fieldData, true, " *; *", " *, *", true); // standardize person names
  2949. }
  2950. elseif (preg_match("/ED: Editor/", $fieldLabel))
  2951. {
  2952. $fieldData = preg_replace("/ \(eds?\)(?= *$| *;)/", "", $fieldData); // remove " (ed)" and/or " (eds)"
  2953. $fieldData = standardizePersonNames($fieldData, true, " *; *", " *, *", true); // standardize person names
  2954. }
  2955. elseif (preg_match("/TI: Title|AB: Abstract/", $fieldLabel))
  2956. {
  2957. if (preg_match("/TI: Title/", $fieldLabel))
  2958. {
  2959. $fieldData = preg_replace("/--/", "-", $fieldData); // remove en-dash markup
  2960. $fieldData = preg_replace("/ *\. *$/", "", $fieldData); // remove any dot from end of title
  2961. }
  2962. if (preg_match("/ su(b|per)\(.+?\)/", $fieldData))
  2963. $fieldData = preg_replace("/ (su(?:b|per))\((.+?)\)/", "[\\1:\\2]", $fieldData); // transform " sub(...)" & " super(...)" markup into "[sub:...]" & "[super:...]" markup
  2964. if (preg_match("/(?<= )mu /", $fieldData))
  2965. $fieldData = preg_replace("/(?<= )mu /", "[mu]", $fieldData); // transform "mu " markup into "[mu]" markup
  2966. }
  2967. // BUILD FIELD PARAMETERS:
  2968. // build an array of key/value pairs:
  2969. // "AU: Author":
  2970. if (preg_match("/AU: Author/", $fieldLabel))
  2971. $fieldParametersArray['author'] = $fieldData;
  2972. // "TI: Title":
  2973. elseif (preg_match("/TI: Title/", $fieldLabel))
  2974. $fieldParametersArray['title'] = $fieldData;
  2975. // "PT: Publication Type":
  2976. elseif (preg_match("/PT: Publication Type/", $fieldLabel)) // could also check for "DT: Document Type" (but DT was added only recently)
  2977. {
  2978. if (preg_match("/[;:,.]/", $fieldData)) // if the "PT: Publication Type" field contains a delimiter (e.g. like: "Journal Article; Conference")
  2979. {
  2980. $correctDocumentType = preg_replace("/(.+?)\s*[;:,.]\s*.*/", "\\1", $fieldData); // extract everything before this delimiter
  2981. $additionalDocumentTypeInfo = preg_replace("/.*?\s*[;:,.]\s*(.+)/", "\\1", $fieldData); // extract everything after this delimiter
  2982. $additionalDocumentTypeInfo = $additionalDocumentTypeInfo; // this info will be appended to any notes field data (see below)
  2983. }
  2984. else // we take the "PT: Publication Type" field contents as they are
  2985. $correctDocumentType = $fieldData;
  2986. // Note that for books the "PT: Publication Type" field will always start with "Book Monograph", no matter whether the referenced
  2987. // publication is a whole book or just a book chapter within that book! This is a design flaw within the CSA full record format.
  2988. // So we can only apply some "good guessing" whether the current record actually references a complete book or just a book chapter:
  2989. if (preg_match("/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m", $singleRecord)) // if the current record is of type "Book Monograph"
  2990. {
  2991. // and if the source field contains some page specification like "213 pp." (AND NOT something like "pp. 76-82" or "p. 216")...
  2992. if (preg_match("/[\d,]+ *pp\b/i", $sourceField) AND !preg_match("/(?<=\W)pp?[. ]+[\w\/,-]+/i", $sourceField))
  2993. $correctDocumentType = "Book Whole"; // ...we assume its a whole book
  2994. else
  2995. $correctDocumentType = "Book Chapter"; // ...otherwise we assume its a book chapter (which may NOT always be correct!)
  2996. }
  2997. $fieldParametersArray['type'] = $correctDocumentType;
  2998. }
  2999. // "PY: Publication Year":
  3000. elseif (preg_match("/PY: Publication Year/", $fieldLabel))
  3001. $fieldParametersArray['year'] = $fieldData;
  3002. // "JN: Journal Name":
  3003. elseif (preg_match("/JN: Journal Name/", $fieldLabel))
  3004. {
  3005. // if the current record is of type "Book Monograph" AND the field "JN: Journal Name" was given within the *original* record data (i.e., before adding stuff to it):
  3006. if (preg_match("/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m", $singleRecord) AND preg_match("/^JN: Journal Name *[\r\n]+ {4,4}/m", $singleRecord))
  3007. // for book monographs the publication title is given in "MT: Monograph Title"; if a "JN: Journal Name" was originally provided as well, we assume, it's the series title:
  3008. $fieldParametersArray['series_title'] = $fieldData;
  3009. else
  3010. $fieldParametersArray['publication'] = $fieldData;
  3011. }
  3012. // "JA: Abbrev Journal Name":
  3013. elseif (preg_match("/JA: Abbrev Journal Name/", $fieldLabel))
  3014. {
  3015. if (preg_match("/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m", $singleRecord)) // if the current record is of type "Book Monograph"
  3016. // for book monographs the publication title is given in "MT: Monograph Title"; if a "JA: Abbrev Journal Name" is provided as well, we assume, it's the abbreviated series title:
  3017. $fieldParametersArray['abbrev_series_title'] = $fieldData;
  3018. else
  3019. $fieldParametersArray['abbrev_journal'] = $fieldData;
  3020. }
  3021. // "MT: Monograph Title":
  3022. elseif (preg_match("/MT: Monograph Title/", $fieldLabel))
  3023. {
  3024. // if the source field contains some page specification like "213 pp." (AND NOT something like "pp. 76-82" or "p. 216")...
  3025. if (preg_match("/[\d,]+ *pp\b/i", $sourceField) AND !preg_match("/(?<=\W)pp?[. ]+[\w\/,-]+/i", $sourceField))
  3026. // ...we assume its a whole book (see above comment), in which case we assign the monograph title to the series title field:
  3027. $fieldParametersArray['series_title'] = $fieldData;
  3028. else
  3029. $fieldParametersArray['publication'] = $fieldData;
  3030. }
  3031. // "JV: Journal Volume":
  3032. elseif (preg_match("/JV: Journal Volume/", $fieldLabel))
  3033. {
  3034. if (preg_match("/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m", $singleRecord)) // if the current record is of type "Book Monograph"
  3035. // for book monographs, if there's a volume given, we assume, it's the series volume:
  3036. $fieldParametersArray['series_volume'] = $fieldData;
  3037. else
  3038. $fieldParametersArray['volume'] = $fieldData;
  3039. }
  3040. // "JI: Journal Issue":
  3041. elseif (preg_match("/JI: Journal Issue/", $fieldLabel))
  3042. {
  3043. if (preg_match("/^(PT: Publication Type\s+Book Monograph|DT: Document Type\s+B)/m", $singleRecord)) // if the current record is of type "Book Monograph"
  3044. // for book monographs, if there's an issue given, we assume, it's the series issue:
  3045. $fieldParametersArray['series_issue'] = $fieldData;
  3046. else
  3047. $fieldParametersArray['issue'] = $fieldData;
  3048. }
  3049. // "JP: Journal Pages":
  3050. elseif (preg_match("/JP: Journal Pages/", $fieldLabel))
  3051. $fieldParametersArray['pages'] = $fieldData;
  3052. // "AF: Affiliation" & "AF: Author Affilition":
  3053. elseif (preg_match("/AF: (Author )?Affilia?tion/", $fieldLabel))
  3054. $fieldParametersArray['address'] = $fieldData;
  3055. // "CA: Corporate Author":
  3056. elseif (preg_match("/CA: Corporate Author/", $fieldLabel))
  3057. $fieldParametersArray['corporate_author'] = $fieldData;
  3058. // "DE: Descriptors":
  3059. elseif (preg_match("/DE: Descriptors/", $fieldLabel)) // currently, the fields "KW: Keywords" and "ID: Identifiers" are ignored!
  3060. $fieldParametersArray['keywords'] = $fieldData;
  3061. // "AB: Abstract":
  3062. elseif (preg_match("/AB: Abstract/", $fieldLabel))
  3063. $fieldParametersArray['abstract'] = $fieldData;
  3064. // "PB: Publisher":
  3065. elseif (preg_match("/PB: Publisher/", $fieldLabel))
  3066. {
  3067. if (preg_match("/^[$upper\W\d]+$/$patternModifiers", $fieldData)) // if all of the words within the publisher name are uppercase, we attempt to convert the string to something more readable:
  3068. // perform case transformation (e.g. convert "ELSEVIER SCIENCE B.V." into "Elsevier Science B.V.")
  3069. $fieldData = changeCase('title', $fieldData);
  3070. $fieldParametersArray['publisher'] = $fieldData;
  3071. }
  3072. // "ED: Editor":
  3073. elseif (preg_match("/ED: Editor/", $fieldLabel))
  3074. $fieldParametersArray['editor'] = $fieldData;
  3075. // "LA: Language":
  3076. elseif (preg_match("/LA: Language/", $fieldLabel))
  3077. $fieldParametersArray['language'] = $fieldData;
  3078. // "SL: Summary Language":
  3079. elseif (preg_match("/SL: Summary Language/", $fieldLabel))
  3080. $fieldParametersArray['summary_language'] = $fieldData;
  3081. // "OT: Original Title":
  3082. elseif (preg_match("/OT: Original Title/", $fieldLabel))
  3083. $fieldParametersArray['orig_title'] = $fieldData;
  3084. // "IS: ISSN":
  3085. elseif (preg_match("/IS: ISSN/", $fieldLabel))
  3086. $fieldParametersArray['issn'] = $fieldData;
  3087. // "IB: ISBN":
  3088. elseif (preg_match("/IB: ISBN/", $fieldLabel))
  3089. $fieldParametersArray['isbn'] = $fieldData;
  3090. // "ER: Environmental Regime":
  3091. elseif (preg_match("/ER: Environmental Regime/", $fieldLabel))
  3092. $environmentalRegime = $fieldData; // this info will be appended to any notes field data (see below)
  3093. // "CF: Conference":
  3094. elseif (preg_match("/CF: Conference/", $fieldLabel))
  3095. $fieldParametersArray['conference'] = $fieldData;
  3096. // "NT: Notes":
  3097. elseif (preg_match("/NT: Notes/", $fieldLabel))
  3098. $fieldParametersArray['notes'] = $fieldData;
  3099. // "DO: DOI":
  3100. elseif (preg_match("/DO: DOI/", $fieldLabel))
  3101. $fieldParametersArray['doi'] = $fieldData;
  3102. }
  3103. // (END LOOP OVER EACH FIELD)
  3104. if (!empty($showSource)) // if we're supposed to display the original source data
  3105. // append original source field data (they will be presented within the header message of 'record.php' for easy comparison with the extracted data):
  3106. $fieldParametersArray['source'] = $sourceField;
  3107. // we'll hack the "notes" element in order to append additional info:
  3108. // (this cannot be done earlier above since we don't know about the presence & order of fields within the source text!)
  3109. if (!empty($additionalDocumentTypeInfo)) // if the "PT: Publication Type" field contains some additional info
  3110. {
  3111. if (isset($fieldParametersArray['notes'])) // and if the notes element is present
  3112. $fieldParametersArray['notes'] = $fieldParametersArray['notes'] . "; " . $additionalDocumentTypeInfo; // append additional info from "PT: Publication Type" field
  3113. else // the notes parameter wasn't specified yet
  3114. $fieldParametersArray['notes'] = $additionalDocumentTypeInfo; // add notes element with additional info from "PT: Publication Type" field
  3115. }
  3116. if (!empty($environmentalRegime)) // if the "ER: Environmental Regime" field contains some data
  3117. {
  3118. if (isset($fieldParametersArray['notes'])) // and if the notes element is present
  3119. $fieldParametersArray['notes'] = $fieldParametersArray['notes'] . "; " . $environmentalRegime; // append "ER: Environmental Regime" field data
  3120. else // the notes parameter wasn't specified yet
  3121. $fieldParametersArray['notes'] = $environmentalRegime; // add notes element with "ER: Environmental Regime" field data
  3122. }
  3123. // Append the array of extracted field data to the main data array which holds all records to import:
  3124. $parsedRecordsArray[] = $fieldParametersArray;
  3125. }
  3126. }
  3127. // (END LOOP OVER EACH RECORD)
  3128. // ----------------------------------------------------------------
  3129. // BUILD REFBASE IMPORT ARRAY:
  3130. $importDataArray = buildImportArray("refbase", // 'type' - the array format of the 'records' element
  3131. "1.0", // 'version' - the version of the given array structure
  3132. "http://refbase.net/import/csa/", // 'creator' - the name of the script/importer (preferably given as unique URI)
  3133. "Matthias Steffens", // 'author' - author/contact name of the person who's responsible for this script/importer
  3134. "refbase@extracts.de", // 'contact' - author's email/contact address
  3135. array('prefix_call_number' => "true"), // 'options' - array with settings that control the behaviour of the 'addRecords()' function
  3136. $parsedRecordsArray); // 'records' - array of record(s) (with each record being a sub-array of fields)
  3137. return array($importDataArray, $recordsCount, $importRecordNumbersRecognizedFormatArray, $importRecordNumbersNotRecognizedFormatArray, $errors);
  3138. }
  3139. // --------------------------------------------------------------------
  3140. /*
  3141. // NOTE: by default, this function is currently disabled, since it uses DOM which is part of PHP 5 but must
  3142. // be installed as a separate PEAR extension for PHP 4. In order to provide widest compatibility with PHP 4,
  3143. // this function should be modified so that it makes use of ActiveLink's XML package instead:
  3144. // <http://www.active-link.com/software/>
  3145. // PUBMED TO CSA
  3146. // This function takes a PubMed ID and fetches corresponding PubMed XML record data from the PubMed server.
  3147. // Record data will be converted to CSA format which can be imported via 'import_csa_modify.php'.
  3148. //
  3149. // Authors: this function was originally written in Python by Andreas Hildebrandt <anhi@bioinf.uni-sb.de>
  3150. // and was ported to PHP by Marc Sturm <sturm@informatik.uni-tuebingen.de>
  // Parameters:
  //   $pubmedID  - the PubMed ID whose record should be fetched from the PubMed server
  //   $pubmedXML - (optional) PubMed XML data that has been fetched already; if given,
  //                no request is sent and '$pubmedID' is not used
  // Returns:
  //   the record data in CSA format (string); an empty string is returned if no XML data
  //   could be obtained or if the XML data could not be parsed
  //
  // NOTE(review): the E-utilities URL below is kept as in the original code; confirm that
  //               this endpoint is still valid before enabling this function again
  function pubmedToCsa($pubmedID, $pubmedXML = null)
  {
      global $contentTypeCharset; // defined in 'ini.inc.php'

      $use_proxy = false;

      if ($pubmedXML === null)
      {
          // the ID ends up in an URL query string, so it must be URL-encoded (not shell-escaped):
          $url = "http://www.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=" . rawurlencode($pubmedID) . "&retmode=xml";

          if ($use_proxy)
              $pubmedXML = proxy_url($url);
          else
              $pubmedXML = file_get_contents($url);
      }

      // both fetch methods return FALSE on failure:
      if (!is_string($pubmedXML) || trim($pubmedXML) == "")
          return "";

      // 'preserveWhiteSpace' only has an effect if it is set before the XML data get loaded:
      $doc = new DOMDocument();
      $doc->preserveWhiteSpace = false;

      if (!$doc->loadXML($pubmedXML))
          return "";

      $xpath = new DOMXPath($doc);

      $result = "";

      foreach ($doc->getElementsByTagName('PubmedArticle') as $ref)
          $result .= pubmedArticleToCsa($xpath, $ref);

      if ($contentTypeCharset == "ISO-8859-1")
      {
          // convert text from Unicode UTF-8 encoding to ISO Latin 1
          $convertedResult = iconv("UTF-8", "ISO-8859-1//TRANSLIT", $result);

          if ($convertedResult !== false) // keep the unconverted text if the conversion failed
              $result = $convertedResult;
      }

      return $result;
  }

  // Fetches the given URL via a HTTP proxy and returns the response body, or FALSE if the
  // connection failed or if the response contains no header/body separator.
  // (This function was previously declared inside of 'pubmedToCsa()' which caused a
  //  "cannot redeclare" error as soon as 'pubmedToCsa()' was called a second time.)
  function proxy_url($proxy_url)
  {
      $proxy_name = 'www-cache.informatik.uni-tuebingen.de';
      $proxy_port = 3128;
      $proxy_user = '';
      $proxy_pass = '';
      $proxy_cont = '';

      $proxy_fp = fsockopen($proxy_name, $proxy_port);

      if (!$proxy_fp)
          return false;

      fputs($proxy_fp, "GET $proxy_url HTTP/1.0\r\nHost: $proxy_name\r\n");
      fputs($proxy_fp, "Proxy-Authorization: Basic " . base64_encode("$proxy_user:$proxy_pass") . "\r\n\r\n");

      while (!feof($proxy_fp))
          $proxy_cont .= fread($proxy_fp, 4096);

      fclose($proxy_fp);

      // strip the response headers:
      $headerEnd = strpos($proxy_cont, "\r\n\r\n");

      if ($headerEnd === false)
          return false;

      return substr($proxy_cont, $headerEnd + 4);
  }

  // Returns the values of all nodes that match the given XPath query (evaluated relative
  // to '$contextNode') as an array of strings; the array is empty if nothing matched.
  function pubmedNodeTexts($xpath, $query, $contextNode)
  {
      $values = array();
      $nodes = $xpath->query($query, $contextNode);

      if ($nodes)
      {
          for ($i = 0; $i < $nodes->length; ++$i)
              $values[] = $nodes->item($i)->nodeValue;
      }

      return $values;
  }

  // Returns the value of the first node that matches the given XPath query (evaluated
  // relative to '$contextNode'), or NULL if nothing matched.
  function pubmedNodeText($xpath, $query, $contextNode)
  {
      $values = pubmedNodeTexts($xpath, $query, $contextNode);

      if (count($values) > 0)
          return $values[0];

      return null;
  }

  // Converts a single 'PubmedArticle' element to CSA format:
  //   1) Only the first affiliation is used
  //   2) " et al." gets appended to the authors if the author list is marked as incomplete
  //   3) Only MeSH headings are used as descriptors, all other types of keywords are ignored
  // Returns an empty string if the element contains no 'MedlineCitation/Article' data.
  function pubmedArticleToCsa($xpath, $ref)
  {
      $months = array('Jan' => '01', 'Feb' => '02', 'Mar' => '03', 'Apr' => '04', 'May' => '05', 'Jun' => '06',
                      'Jul' => '07', 'Aug' => '08', 'Sep' => '09', 'Oct' => '10', 'Nov' => '11', 'Dec' => '12');

      $med = $ref->getElementsByTagName('MedlineCitation')->item(0);

      if ($med === null)
          return "";

      $article = $med->getElementsByTagName('Article')->item(0);

      if ($article === null)
          return "";

      $result = "";

      // Title:
      $title = pubmedNodeText($xpath, "ArticleTitle/text()", $article);
      $result .= "TI: Title\n $title\n";

      // Authors:
      $authorNames = array();
      $add_et_al = false;
      $author_list = $article->getElementsByTagName('AuthorList')->item(0);

      if ($author_list !== null)
      {
          // 'getAttribute()' returns an empty string if the attribute is missing
          $add_et_al = ($author_list->getAttribute('CompleteYN') == 'N');

          foreach ($author_list->getElementsByTagName('Author') as $author)
          {
              $name = pubmedNodeText($xpath, "LastName/text()", $author);

              if ($name === null) // collective authors have no 'LastName' element
                  $name = pubmedNodeText($xpath, "CollectiveName/text()", $author);

              if ($name === null)
                  continue;

              $forename = pubmedNodeText($xpath, "ForeName/text()", $author);

              if ($forename === null)
                  $forename = pubmedNodeText($xpath, "Initials/text()", $author);

              $authorNames[] = $name . ", " . $forename;
          }
      }

      $author_line = implode("; ", $authorNames);

      if ($add_et_al)
          $author_line .= " et al.";

      $result .= "AU: Author\n $author_line\n";

      // Affiliation:
      $affiliation = pubmedNodeText($xpath, "Affiliation/text()", $article);

      if ($affiliation !== null)
          $result .= "AF: Affiliation\n " . $affiliation . "\n";

      // Source:
      if ($ref->getElementsByTagName('MedlineJournalInfo')->length == 0) {
          // NOTE: this terminates the whole script (behaviour kept from the original code)
          print "No useable source information given!";
          exit(1);
      }

      $year = pubmedNodeText($xpath, "Journal/JournalIssue/PubDate/Year/text()", $article);

      $source = pubmedNodeText($xpath, "MedlineJournalInfo/MedlineTA/text()", $med) . ". ";

      $volume = pubmedNodeText($xpath, "Journal/JournalIssue/Volume/text()", $article);
      if ($volume !== null)
          $source .= "Vol. " . $volume;

      $issue = pubmedNodeText($xpath, "Journal/JournalIssue/Issue/text()", $article);
      if ($issue !== null)
          $source .= " no. " . $issue;

      $pages = pubmedNodeText($xpath, "Pagination/MedlinePgn/text()", $article);
      if ($pages !== null)
          $source .= ", pp. " . $pages;

      if ($year !== null)
          $source .= ". " . $year . ".";

      $result .= "SO: Source\n " . $source . "\n";

      // ISSN:
      $issn = pubmedNodeText($xpath, "Journal/ISSN/text()", $article);
      if ($issn !== null)
          $result .= "IS: ISSN\n " . $issn . "\n";

      // Abstract:
      $abstract = pubmedNodeText($xpath, "Abstract/AbstractText/text()", $article);
      if ($abstract !== null)
          $result .= "AB: Abstract\n " . $abstract . "\n";

      // Language:
      $language = pubmedNodeText($xpath, "Language/text()", $article);
      if ($language !== null)
          $result .= "LA: Language\n " . $language . "\n";

      // Publication date (year, followed by two-digit month and day; "00" if unknown):
      if (count(pubmedNodeTexts($xpath, "Journal/JournalIssue/PubDate", $article)) > 0)
      {
          $pubdate = "";

          if ($year !== null)
          {
              $pubdate = $year;
              $month = pubmedNodeText($xpath, "Journal/JournalIssue/PubDate/Month/text()", $article);

              if ($month !== null)
              {
                  if (isset($months[$month]))
                      $pubdate .= $months[$month];
                  elseif (preg_match('/^\d{1,2}$/', $month)) // month given as number
                      $pubdate .= str_pad($month, 2, "0", STR_PAD_LEFT);
                  else // unknown month name
                      $pubdate .= "00";

                  $day = pubmedNodeText($xpath, "Journal/JournalIssue/PubDate/Day/text()", $article);

                  if ($day !== null)
                      $pubdate .= $day;
                  else
                      $pubdate .= "00";
              }
              else
              {
                  $pubdate .= "00";
              }
          }

          $result .= "PD: Publication Date\n " . $pubdate . "\n";
      }

      // Publication type(s):
      $publicationTypes = pubmedNodeTexts($xpath, "PublicationTypeList/PublicationType/text()", $article);

      if (count($publicationTypes) > 0)
          $result .= "PT: Publication Type\n " . implode("; ", $publicationTypes) . "\n";

      // Descriptors (MeSH headings):
      $descriptors = pubmedNodeTexts($xpath, "MeshHeadingList/MeshHeading/DescriptorName/text()", $med);

      if (count($descriptors) > 0)
          $result .= "DE: Descriptors\n " . implode("; ", $descriptors) . "\n";

      // Publication year:
      if ($year !== null)
          $result .= "PY: Publication Year\n " . $year . "\n";

      return $result;
  }
  3291. */
  3292. // --------------------------------------------------------------------
  3293. ?>