{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f43c8f32-7466-4ead-889b-5950bc9e750a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pymongo\n",
    "\n",
    "from pprint import pprint\n",
    "\n",
    "# print(os.environ)\n",
    "# import mdq_setenv\n",
    "# mdq_setenv.mdq_setenv()\n",
    "# print(os.environ)\n",
    "\n",
    "# Values shared by the cells further down\n",
    "TOP_X = 10             # number of sample records printed per query\n",
    "SepBar = \"-\" * 10      # separator line printed between records\n",
    "\n",
    "# Connection settings are read from the environment; a missing variable raises KeyError\n",
    "port = os.environ[\"MDQ_MONGODB_PORT\"]\n",
    "db_name = os.environ[\"MDQ_MONGODB_DB\"]\n",
    "\n",
    "# Connect to the MongoDB server on localhost, then pick the database and its 'xref' collection\n",
    "mongodb_uri = \"mongodb://localhost:\" + port + \"/\"\n",
    "client = pymongo.MongoClient(mongodb_uri)\n",
    "database = client[db_name]\n",
    "xref_collection = database[\"xref\"]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "f8cda52c-f945-4fd4-9293-5e75be3f28ac",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Estimated num_records = 136634831\n"
     ]
    }
   ],
   "source": [
    "# Two ways to size the collection:\n",
    "#   count_documents({})        -- runs the query/find and counts the items returned, so it is expensive\n",
    "#   estimated_document_count() -- much faster when only the total for the complete collection is wanted\n",
    "# The exact count is deliberately left disabled:\n",
    "#num_records = xref_collection.count_documents({})\n",
    "\n",
    "est_num_records = xref_collection.estimated_document_count()\n",
    "print(f\"Estimated num_records = {est_num_records}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "d55f6f69-a820-4d5e-90c5-a6524d3441bb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'DOI': '10.24034/kreanova.v3i1.5327',\n",
      " '_id': ObjectId('67a53c273cbfceb3083b9732'),\n",
      " 'authors': [{'affiliation': [],\n",
      "              'family': 'Fitria',\n",
      "              'given': 'Yunita',\n",
      "              'sequence': 'first'},\n",
      "             {'affiliation': [],\n",
      "              'family': 'Kusumawardani',\n",
      "              'given': 'Anisa',\n",
      "              'sequence': 'additional'},\n",
      "             {'affiliation': [],\n",
      "              'family': 'Nur Khairin',\n",
      "              'given': 'Fibriyani',\n",
      "              'sequence': 'additional'},\n",
      "             {'affiliation': [],\n",
      "              'family': 'Syakura',\n",
      "              'given': 'Muhammad Abadan',\n",
      "              'sequence': 'additional'}],\n",
      " 'dateCreated': '2023-01-11T08:59:40Z',\n",
      " 'dateDeposited': '2023-01-11T08:59:41Z',\n",
      " 'dateIndexed': '2023-01-12T06:00:54Z',\n",
      " 'issued': [[2023, 1, 11]],\n",
      " 'title': 'PENGAYAAN PENGETAHUAN SISWA SMA 3 SAMARINDA MELALUI PENGENALAN DAN '\n",
      "          'PELATIHAN AKUNTANSI SYARIAH',\n",
      " 'typeOfWork': 'journal-article'}\n"
     ]
    }
   ],
   "source": [
    "# Fetch one document (no filter, so whichever record the server returns first)\n",
    "# and pretty-print it as an example of the record layout\n",
    "xref_sample = xref_collection.find_one()\n",
    "pprint(xref_sample)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "60524738-6146-4328-9258-795aa6863310",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Away to get earliest dateCreated record, using find().sort() -- this can take a few minutes\n"
     ]
    }
   ],
   "source": [
    "# Get the earliest date of a publication\n",
    "print(\"Away to get earliest dateCreated record, using find().sort() -- this can take a few minutes\")\n",
    "\n",
    "# An ascending sort places documents whose dateCreated is missing or null ahead of every real value,\n",
    "# which would make the [\"dateCreated\"] lookup below raise KeyError (or report None).\n",
    "# The {\"$ne\": None} filter keeps only documents that actually carry a value.\n",
    "# limit(1) asks the server for the single smallest entry only.\n",
    "# NOTE(review): the sample record stores dateCreated as an ISO-8601 'Z' string, for which string order\n",
    "# equals chronological order -- this assumes every record uses that format; confirm.\n",
    "xref_sorted_date_created_query = (\n",
    "    xref_collection.find({\"dateCreated\": {\"$ne\": None}})\n",
    "    .sort(\"dateCreated\", pymongo.ASCENDING)\n",
    "    .limit(1)\n",
    ")\n",
    "\n",
    "# next(..., None) copes with an empty result, where indexing the cursor with [0] raises IndexError\n",
    "xref_earliest_date_created_rec = next(xref_sorted_date_created_query, None)\n",
    "\n",
    "if xref_earliest_date_created_rec is None:\n",
    "    xref_earliest_date_created = None\n",
    "    print(\"No record with a dateCreated value was found\")\n",
    "else:\n",
    "    xref_earliest_date_created = xref_earliest_date_created_rec[\"dateCreated\"]\n",
    "    print(f\"Earliest dateCreated = {xref_earliest_date_created}\")\n",
    "    print(SepBar)\n",
    "    pprint(xref_earliest_date_created_rec)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "edfe053c-12cf-4e02-b8d5-c4aaebd348b7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XRef records with non-ASCII chars in title (first 10)\n",
      "{'DOI': '10.34238/tnu-jst.6190',\n",
      " 'title': 'XÂY DỰNG HỆ THỐNG KẾT NỐI VÀ ĐIỀU KHIỂN THIẾT BỊ  TRONG BỘ THÍ '\n",
      "          'NGHIỆM THỦY LỰC WS200 BẰNG PLC'}\n",
      "----------\n",
      "{'DOI': '10.35993/ijitl.v8i2.2399',\n",
      " 'title': 'Evaluation of Secondary School Science Teachers’ Knowledge about '\n",
      "          'Nature of Science'}\n",
      "----------\n",
      "{'DOI': '10.36059/978-966-397-268-8-14',\n",
      " 'title': 'ПРИНЦИПИ ОРГАНІЗАЦІЇ АУДИТУ ТА АУДИТОРСЬКОЇ ДІЯЛЬНОСТІ В УКРАЇНІ'}\n",
      "----------\n",
      "{'DOI': '10.36059/978-966-397-268-8-15',\n",
      " 'title': 'СУЧАСНИЙ СТАН ДЕРЖАВНОЇ ПІДТРИМКИ СУБ’ЄКТІВ ГОСПОДАРЮВАННЯ У СФЕРІ '\n",
      "          'ТУРИЗМУ'}\n",
      "----------\n",
      "{'DOI': '10.36059/978-966-397-268-8-27',\n",
      " 'title': 'АДМІНІСТРАТИВНО-ПРАВОВІ МЕТОДИ НАГЛЯДУ (КОНТРОЛЮ) У СФЕРІ ОХОРОНИ '\n",
      "          'НАВКОЛИШНЬОГО ПРИРОДНОГО СЕРЕДОВИЩА'}\n",
      "----------\n",
      "{'DOI': '10.36684/80-1-2022-107-111',\n",
      " 'title': 'НАУЧНЫЕ КАДРЫ И СТАНОВЛЕНИЕ АКАДЕМИЧЕСКОЙ ГУМАНИТАРНОЙ НАУКИ В '\n",
      "          'БАШКОРТОСТАНЕ В ПЕРВЫЕ ГОДЫ СОВЕТСКОЙ ВЛАСТИ'}\n",
      "----------\n",
      "{'DOI': '10.36007/eruedu.2022.4.015-024',\n",
      " 'title': 'A kelet–nyugati párbeszéd az „enyhülés fogságában” töltött '\n",
      "          'időszakban 1975–1985 között'}\n",
      "----------\n",
      "{'DOI': '10.37992/2022.1304.176',\n",
      " 'title': 'Interpreting genotype × environment interaction in greengram (Vigna '\n",
      "          'radiata L. Wilczek) using Eberhart and Russell Model'}\n",
      "----------\n",
      "{'DOI': '10.37884/4-2022/06',\n",
      " 'title': 'САУЫННАН КЕЙІН СИЫРЛАРДЫҢ ЖЕЛІН ҮРПІСІН САНИТАРИЯЛЫҚ ӨҢДЕУГЕ '\n",
      "          'АРНАЛҒАН ПРЕПАРАТТАРДЫҢ ТИІМДІЛІГІН САЛЫСТЫРМАЛЫ БАҒАЛАУ'}\n",
      "----------\n",
      "{'DOI': '10.38116/ppp62art6',\n",
      " 'title': 'Revisando o desenvolvimento em educação dos municípios brasileiros '\n",
      "          ': uma nova proposta de mensuração'}\n",
      "----------\n"
     ]
    }
   ],
   "source": [
    "# Select records whose title contains at least one character in the class U+0080..U+FFFF\n",
    "# (i.e. outside 7-bit ASCII), returning only the DOI and title fields.\n",
    "# NOTE(review): the class stops at U+FFFF, so titles whose only non-ASCII characters lie above the\n",
    "# Basic Multilingual Plane (e.g. emoji) are presumably not matched -- confirm whether that is intended.\n",
    "xref_titles_with_non_ascii_cursor = xref_collection.find({\"title\":{\"$regex\":\"[\\u0080-\\uFFFF]\",\"$options\":\"\"}},{\"DOI\": 1, \"title\": 1, \"_id\":0})\n",
    "\n",
    "print(f\"XRef records with non-ASCII chars in title (first {TOP_X})\")\n",
    "\n",
    "# Indexing a PyMongo cursor (cursor[i]) issues a separate skip=i query for every element, and raises\n",
    "# IndexError when fewer than TOP_X records match. Run ONE query capped at TOP_X results instead.\n",
    "# clone() leaves the cursor above unevaluated, just as indexing did.\n",
    "# limit(0) means 'no limit', hence the TOP_X > 0 guard.\n",
    "if TOP_X > 0:\n",
    "    for xref_rec in xref_titles_with_non_ascii_cursor.clone().limit(TOP_X):\n",
    "        pprint(xref_rec)\n",
    "        print(SepBar)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "46794f48-e286-471b-9264-87dc16727c4e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "XRef records with non-Latin1 chars in title (first 10)\n",
      "{'DOI': '10.34238/tnu-jst.6190',\n",
      " 'title': 'XÂY DỰNG HỆ THỐNG KẾT NỐI VÀ ĐIỀU KHIỂN THIẾT BỊ  TRONG BỘ THÍ '\n",
      "          'NGHIỆM THỦY LỰC WS200 BẰNG PLC'}\n",
      "----------\n",
      "{'DOI': '10.35993/ijitl.v8i2.2399',\n",
      " 'title': 'Evaluation of Secondary School Science Teachers’ Knowledge about '\n",
      "          'Nature of Science'}\n",
      "----------\n",
      "{'DOI': '10.36059/978-966-397-268-8-14',\n",
      " 'title': 'ПРИНЦИПИ ОРГАНІЗАЦІЇ АУДИТУ ТА АУДИТОРСЬКОЇ ДІЯЛЬНОСТІ В УКРАЇНІ'}\n",
      "----------\n",
      "{'DOI': '10.36059/978-966-397-268-8-15',\n",
      " 'title': 'СУЧАСНИЙ СТАН ДЕРЖАВНОЇ ПІДТРИМКИ СУБ’ЄКТІВ ГОСПОДАРЮВАННЯ У СФЕРІ '\n",
      "          'ТУРИЗМУ'}\n",
      "----------\n",
      "{'DOI': '10.36059/978-966-397-268-8-27',\n",
      " 'title': 'АДМІНІСТРАТИВНО-ПРАВОВІ МЕТОДИ НАГЛЯДУ (КОНТРОЛЮ) У СФЕРІ ОХОРОНИ '\n",
      "          'НАВКОЛИШНЬОГО ПРИРОДНОГО СЕРЕДОВИЩА'}\n",
      "----------\n",
      "{'DOI': '10.36684/80-1-2022-107-111',\n",
      " 'title': 'НАУЧНЫЕ КАДРЫ И СТАНОВЛЕНИЕ АКАДЕМИЧЕСКОЙ ГУМАНИТАРНОЙ НАУКИ В '\n",
      "          'БАШКОРТОСТАНЕ В ПЕРВЫЕ ГОДЫ СОВЕТСКОЙ ВЛАСТИ'}\n",
      "----------\n",
      "{'DOI': '10.36007/eruedu.2022.4.015-024',\n",
      " 'title': 'A kelet–nyugati párbeszéd az „enyhülés fogságában” töltött '\n",
      "          'időszakban 1975–1985 között'}\n",
      "----------\n",
      "{'DOI': '10.37884/4-2022/06',\n",
      " 'title': 'САУЫННАН КЕЙІН СИЫРЛАРДЫҢ ЖЕЛІН ҮРПІСІН САНИТАРИЯЛЫҚ ӨҢДЕУГЕ '\n",
      "          'АРНАЛҒАН ПРЕПАРАТТАРДЫҢ ТИІМДІЛІГІН САЛЫСТЫРМАЛЫ БАҒАЛАУ'}\n",
      "----------\n",
      "{'DOI': '10.34238/tnu-jst.5971',\n",
      " 'title': 'NGHIÊN CỨU MÔ PHỎNG NGUỒN ĐỘNG LỰC XE HYBRID  TRÊN PHẦN MỀM '\n",
      "          'AVL-CRUISE'}\n",
      "----------\n",
      "{'DOI': '10.34238/tnu-jst.6580',\n",
      " 'title': 'THIẾT KẾ CÁC BỘ ĐIỀU KHIỂN PID VÀ FLC-SUGENO TỐI ƯU  CHO HỆ THỐNG '\n",
      "          'QUẠT GIÓ - CÁNH PHẲNG SỬ DỤNG THUẬT TOÁN PSO'}\n",
      "----------\n"
     ]
    }
   ],
   "source": [
    "# Select records whose title contains at least one character in the class U+0100..U+FFFF\n",
    "# (i.e. beyond the Latin-1 range U+0000..U+00FF), returning only the DOI and title fields.\n",
    "# NOTE(review): the class stops at U+FFFF, so titles whose only non-Latin1 characters lie above the\n",
    "# Basic Multilingual Plane (e.g. emoji) are presumably not matched -- confirm whether that is intended.\n",
    "xref_titles_with_non_latin1_cursor = xref_collection.find({\"title\":{\"$regex\":\"[\\u0100-\\uFFFF]\",\"$options\":\"\"}},{\"DOI\": 1, \"title\": 1, \"_id\":0})\n",
    "\n",
    "print(f\"XRef records with non-Latin1 chars in title (first {TOP_X})\")\n",
    "\n",
    "# Indexing a PyMongo cursor (cursor[i]) issues a separate skip=i query for every element, and raises\n",
    "# IndexError when fewer than TOP_X records match. Run ONE query capped at TOP_X results instead.\n",
    "# clone() leaves the cursor above unevaluated, just as indexing did.\n",
    "# limit(0) means 'no limit', hence the TOP_X > 0 guard.\n",
    "if TOP_X > 0:\n",
    "    for xref_rec in xref_titles_with_non_latin1_cursor.clone().limit(TOP_X):\n",
    "        pprint(xref_rec)\n",
    "        print(SepBar)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5dbd5db-e27f-48f8-b35f-16ae0a4e248d",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}