{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "f43c8f32-7466-4ead-889b-5950bc9e750a", "metadata": {}, "outputs": [], "source": [ "import os\n", "import pymongo\n", "\n", "from pprint import pprint\n", "\n", "# Connection settings are read from the kernel's environment; a KeyError below means they are unset.\n", "# NOTE(review): the project's mdq_setenv helper (import mdq_setenv; mdq_setenv.mdq_setenv())\n", "# presumably populates them -- confirm before relying on it.\n", "\n", "# Number of sample records each listing cell prints, and the separator printed between records\n", "TOP_X = 10\n", "SepBar = \"-\" * 10\n", "\n", "port = os.environ[\"MDQ_MONGODB_PORT\"]\n", "db_name = os.environ[\"MDQ_MONGODB_DB\"]\n", "\n", "# Connect to the MongoDB server on localhost and pick out the 'xref' collection used by every later cell\n", "client = pymongo.MongoClient(f\"mongodb://localhost:{port}/\")\n", "database = client[db_name]\n", "xref_collection = database[\"xref\"]\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "f8cda52c-f945-4fd4-9293-5e75be3f28ac", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Estimated num_records = 136634831\n" ] } ], "source": [ "# Size of the whole collection.\n", "# estimated_document_count() is used because we only want the count for the complete collection and it is\n", "# much faster than count_documents({}), which runs the query/find and counts the items returned.\n", "est_num_records = xref_collection.estimated_document_count()\n", "\n", "print(f\"Estimated num_records = {est_num_records}\")" ] }, { "cell_type": "code", "execution_count": 3, "id": "d55f6f69-a820-4d5e-90c5-a6524d3441bb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'DOI': '10.24034/kreanova.v3i1.5327',\n", " '_id': ObjectId('67a53c273cbfceb3083b9732'),\n", " 'authors': [{'affiliation': [],\n", " 'family': 'Fitria',\n", " 'given': 'Yunita',\n", " 'sequence': 'first'},\n", " {'affiliation': [],\n", " 'family': 'Kusumawardani',\n", " 'given': 'Anisa',\n", " 'sequence': 'additional'},\n", " {'affiliation': [],\n", " 'family': 'Nur Khairin',\n", " 'given': 'Fibriyani',\n", " 'sequence': 'additional'},\n", " {'affiliation': [],\n", " 'family': 'Syakura',\n", " 'given': 'Muhammad Abadan',\n", " 'sequence': 
'additional'}],\n", " 'dateCreated': '2023-01-11T08:59:40Z',\n", " 'dateDeposited': '2023-01-11T08:59:41Z',\n", " 'dateIndexed': '2023-01-12T06:00:54Z',\n", " 'issued': [[2023, 1, 11]],\n", " 'title': 'PENGAYAAN PENGETAHUAN SISWA SMA 3 SAMARINDA MELALUI PENGENALAN DAN '\n", " 'PELATIHAN AKUNTANSI SYARIAH',\n", " 'typeOfWork': 'journal-article'}\n" ] } ], "source": [ "# Pretty-print the first item returned from the collection as a sample\n", "xref_sample = xref_collection.find_one({})\n", "pprint(xref_sample)" ] }, { "cell_type": "code", "execution_count": null, "id": "60524738-6146-4328-9258-795aa6863310", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Away to get earliest dateCreated record, using find().sort() -- this can take a few minutes\n" ] } ], "source": [ "# Get the record with the earliest dateCreated value in the collection\n", "print(\"Away to get earliest dateCreated record, using find().sort() -- this can take a few minutes\")\n", "\n", "# In the sample record printed by the previous cell, dateCreated is an ISO-8601 string; assuming every\n", "# record uses that format (TODO confirm), an ascending string sort puts the earliest timestamp first.\n", "# The $ne filter skips records whose dateCreated is missing or null: those sort before any string and\n", "# would make the [\"dateCreated\"] lookup fail (missing field) or report None as the earliest value.\n", "# NOTE(review): an index on dateCreated would presumably avoid the slow full scan -- confirm one can be added.\n", "xref_sorted_date_created_query = (\n", "    xref_collection.find({\"dateCreated\": {\"$ne\": None}})\n", "    .sort(\"dateCreated\", pymongo.ASCENDING)\n", "    .limit(1)\n", ")\n", "# next(..., None) rather than cursor[0], which raises IndexError when nothing matches\n", "xref_earliest_date_created_rec = next(xref_sorted_date_created_query, None)\n", "\n", "if xref_earliest_date_created_rec is None:\n", "    print(\"No record with a dateCreated value found\")\n", "else:\n", "    xref_earliest_date_created = xref_earliest_date_created_rec[\"dateCreated\"]\n", "\n", "    print(f\"Earliest dateCreated = {xref_earliest_date_created}\")\n", "    print(SepBar)\n", "    pprint(xref_earliest_date_created_rec)\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "edfe053c-12cf-4e02-b8d5-c4aaebd348b7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XRef records with non-ASCII chars in title (first 10)\n", "{'DOI': '10.34238/tnu-jst.6190',\n", " 'title': 'XÂY DỰNG HỆ THỐNG KẾT NỐI VÀ ĐIỀU KHIỂN THIẾT BỊ TRONG BỘ THÍ '\n", " 'NGHIỆM THỦY LỰC WS200 BẰNG PLC'}\n", "----------\n", "{'DOI': '10.35993/ijitl.v8i2.2399',\n", " 'title': 'Evaluation of Secondary School Science Teachers’ Knowledge about '\n", " 'Nature of Science'}\n", "----------\n", "{'DOI': '10.36059/978-966-397-268-8-14',\n", " 'title': 'ПРИНЦИПИ 
ОРГАНІЗАЦІЇ АУДИТУ ТА АУДИТОРСЬКОЇ ДІЯЛЬНОСТІ В УКРАЇНІ'}\n", "----------\n", "{'DOI': '10.36059/978-966-397-268-8-15',\n", " 'title': 'СУЧАСНИЙ СТАН ДЕРЖАВНОЇ ПІДТРИМКИ СУБ’ЄКТІВ ГОСПОДАРЮВАННЯ У СФЕРІ '\n", " 'ТУРИЗМУ'}\n", "----------\n", "{'DOI': '10.36059/978-966-397-268-8-27',\n", " 'title': 'АДМІНІСТРАТИВНО-ПРАВОВІ МЕТОДИ НАГЛЯДУ (КОНТРОЛЮ) У СФЕРІ ОХОРОНИ '\n", " 'НАВКОЛИШНЬОГО ПРИРОДНОГО СЕРЕДОВИЩА'}\n", "----------\n", "{'DOI': '10.36684/80-1-2022-107-111',\n", " 'title': 'НАУЧНЫЕ КАДРЫ И СТАНОВЛЕНИЕ АКАДЕМИЧЕСКОЙ ГУМАНИТАРНОЙ НАУКИ В '\n", " 'БАШКОРТОСТАНЕ В ПЕРВЫЕ ГОДЫ СОВЕТСКОЙ ВЛАСТИ'}\n", "----------\n", "{'DOI': '10.36007/eruedu.2022.4.015-024',\n", " 'title': 'A kelet–nyugati párbeszéd az „enyhülés fogságában” töltött '\n", " 'időszakban 1975–1985 között'}\n", "----------\n", "{'DOI': '10.37992/2022.1304.176',\n", " 'title': 'Interpreting genotype × environment interaction in greengram (Vigna '\n", " 'radiata L. Wilczek) using Eberhart and Russell Model'}\n", "----------\n", "{'DOI': '10.37884/4-2022/06',\n", " 'title': 'САУЫННАН КЕЙІН СИЫРЛАРДЫҢ ЖЕЛІН ҮРПІСІН САНИТАРИЯЛЫҚ ӨҢДЕУГЕ '\n", " 'АРНАЛҒАН ПРЕПАРАТТАРДЫҢ ТИІМДІЛІГІН САЛЫСТЫРМАЛЫ БАҒАЛАУ'}\n", "----------\n", "{'DOI': '10.38116/ppp62art6',\n", " 'title': 'Revisando o desenvolvimento em educação dos municípios brasileiros '\n", " ': uma nova proposta de mensuração'}\n", "----------\n" ] } ], "source": [ "# List the first TOP_X records whose title contains a character outside 7-bit ASCII.\n", "# The regex character class spans U+0080..U+FFFF; only DOI and title are projected.\n", "# NOTE(review): characters above U+FFFF are not covered by this class -- confirm that is intended.\n", "#\n", "# limit(TOP_X) + a single pass over the cursor: indexing the cursor (cursor[i]) re-issues the find with\n", "# skip=i for every record, and raises IndexError when fewer than TOP_X titles match.\n", "xref_titles_with_non_ascii_cursor = xref_collection.find(\n", "    {\"title\": {\"$regex\": \"[\\u0080-\\uFFFF]\"}},\n", "    {\"DOI\": 1, \"title\": 1, \"_id\": 0},\n", ").limit(TOP_X)\n", "\n", "print(f\"XRef records with non-ASCII chars in title (first {TOP_X})\")\n", "\n", "for xref_rec in xref_titles_with_non_ascii_cursor:\n", "    pprint(xref_rec)\n", "    print(SepBar)" ] }, { "cell_type": "code", "execution_count": 6, "id": "46794f48-e286-471b-9264-87dc16727c4e", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "XRef records with non-Latin1 chars in 
title (first 10)\n", "{'DOI': '10.34238/tnu-jst.6190',\n", " 'title': 'XÂY DỰNG HỆ THỐNG KẾT NỐI VÀ ĐIỀU KHIỂN THIẾT BỊ TRONG BỘ THÍ '\n", " 'NGHIỆM THỦY LỰC WS200 BẰNG PLC'}\n", "----------\n", "{'DOI': '10.35993/ijitl.v8i2.2399',\n", " 'title': 'Evaluation of Secondary School Science Teachers’ Knowledge about '\n", " 'Nature of Science'}\n", "----------\n", "{'DOI': '10.36059/978-966-397-268-8-14',\n", " 'title': 'ПРИНЦИПИ ОРГАНІЗАЦІЇ АУДИТУ ТА АУДИТОРСЬКОЇ ДІЯЛЬНОСТІ В УКРАЇНІ'}\n", "----------\n", "{'DOI': '10.36059/978-966-397-268-8-15',\n", " 'title': 'СУЧАСНИЙ СТАН ДЕРЖАВНОЇ ПІДТРИМКИ СУБ’ЄКТІВ ГОСПОДАРЮВАННЯ У СФЕРІ '\n", " 'ТУРИЗМУ'}\n", "----------\n", "{'DOI': '10.36059/978-966-397-268-8-27',\n", " 'title': 'АДМІНІСТРАТИВНО-ПРАВОВІ МЕТОДИ НАГЛЯДУ (КОНТРОЛЮ) У СФЕРІ ОХОРОНИ '\n", " 'НАВКОЛИШНЬОГО ПРИРОДНОГО СЕРЕДОВИЩА'}\n", "----------\n", "{'DOI': '10.36684/80-1-2022-107-111',\n", " 'title': 'НАУЧНЫЕ КАДРЫ И СТАНОВЛЕНИЕ АКАДЕМИЧЕСКОЙ ГУМАНИТАРНОЙ НАУКИ В '\n", " 'БАШКОРТОСТАНЕ В ПЕРВЫЕ ГОДЫ СОВЕТСКОЙ ВЛАСТИ'}\n", "----------\n", "{'DOI': '10.36007/eruedu.2022.4.015-024',\n", " 'title': 'A kelet–nyugati párbeszéd az „enyhülés fogságában” töltött '\n", " 'időszakban 1975–1985 között'}\n", "----------\n", "{'DOI': '10.37884/4-2022/06',\n", " 'title': 'САУЫННАН КЕЙІН СИЫРЛАРДЫҢ ЖЕЛІН ҮРПІСІН САНИТАРИЯЛЫҚ ӨҢДЕУГЕ '\n", " 'АРНАЛҒАН ПРЕПАРАТТАРДЫҢ ТИІМДІЛІГІН САЛЫСТЫРМАЛЫ БАҒАЛАУ'}\n", "----------\n", "{'DOI': '10.34238/tnu-jst.5971',\n", " 'title': 'NGHIÊN CỨU MÔ PHỎNG NGUỒN ĐỘNG LỰC XE HYBRID TRÊN PHẦN MỀM '\n", " 'AVL-CRUISE'}\n", "----------\n", "{'DOI': '10.34238/tnu-jst.6580',\n", " 'title': 'THIẾT KẾ CÁC BỘ ĐIỀU KHIỂN PID VÀ FLC-SUGENO TỐI ƯU CHO HỆ THỐNG '\n", " 'QUẠT GIÓ - CÁNH PHẲNG SỬ DỤNG THUẬT TOÁN PSO'}\n", "----------\n" ] } ],
"source": [ "# List the first TOP_X records whose title contains a character outside Latin-1.\n", "# The regex character class spans U+0100..U+FFFF; only DOI and title are projected.\n", "# NOTE(review): characters above U+FFFF are not covered by this class -- confirm that is intended.\n", "#\n", "# limit(TOP_X) + a single pass over the cursor: indexing the cursor (cursor[i]) re-issues the find with\n", "# skip=i for every record, and raises IndexError when fewer than TOP_X titles match.\n", "xref_titles_with_non_latin1_cursor = xref_collection.find(\n", "    {\"title\": {\"$regex\": \"[\\u0100-\\uFFFF]\"}},\n", "    {\"DOI\": 1, \"title\": 1, \"_id\": 0},\n", ").limit(TOP_X)\n", "\n", "print(f\"XRef records with non-Latin1 chars in title (first {TOP_X})\")\n", "\n", "for xref_rec in xref_titles_with_non_latin1_cursor:\n", "    pprint(xref_rec)\n", "    print(SepBar)" ] }, { "cell_type": "code", "execution_count": null, "id": "e5dbd5db-e27f-48f8-b35f-16ae0a4e248d", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.2" } }, "nbformat": 4, "nbformat_minor": 5 }