From be2aa709b8e6e423783ce6e32b61198b17babb51 Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Fri, 5 Jun 2026 16:29:14 +0200 Subject: [PATCH 1/9] Add ETL pipeline: standardizer, transformer, validator, api_retriever, mappings, demo notebook --- .gitignore | 12 +- demo_etl_pipeline.ipynb | 674 ++++++++++++++++++ www/services/__init__.py | 19 +- www/services/etl/___init__.py | 4 + www/services/etl/api_retriever.py | 281 ++++++++ www/services/etl/mappings/__init__.py | 5 + .../etl/mappings/dimensions_mapping.py | 21 + www/services/etl/mappings/openlex_mapping.py | 21 + www/services/etl/mappings/pubmed_mapping.py | 23 + www/services/etl/mappings/scopus_mappin.py | 29 + www/services/etl/standardizer.py | 117 +++ www/services/etl/transformer.py | 175 +++++ www/services/etl/validator.py | 92 +++ 13 files changed, 1455 insertions(+), 18 deletions(-) create mode 100644 demo_etl_pipeline.ipynb create mode 100644 www/services/etl/___init__.py create mode 100644 www/services/etl/api_retriever.py create mode 100644 www/services/etl/mappings/__init__.py create mode 100644 www/services/etl/mappings/dimensions_mapping.py create mode 100644 www/services/etl/mappings/openlex_mapping.py create mode 100644 www/services/etl/mappings/pubmed_mapping.py create mode 100644 www/services/etl/mappings/scopus_mappin.py create mode 100644 www/services/etl/standardizer.py create mode 100644 www/services/etl/transformer.py create mode 100644 www/services/etl/validator.py diff --git a/.gitignore b/.gitignore index 23b99e089..2815e7977 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,14 @@ __pycache__/ bibliovenv/ Bibenv/ -.idea/ \ No newline at end of file +.idea/ + +# ETL test files +scopus_test.csv +scopus_standardized.csv +openalex_standardized.csv +pubmed_standardized.csv +test_etl.py + +# Jupyter checkpoints +.ipynb_checkpoints/ \ No newline at end of file diff --git a/demo_etl_pipeline.ipynb b/demo_etl_pipeline.ipynb new file mode 100644 index 000000000..6c45c60c2 --- /dev/null +++ 
b/demo_etl_pipeline.ipynb @@ -0,0 +1,674 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "12ef2c5b-d603-48eb-a551-a5e36def9c2d", + "metadata": {}, + "source": [ + "# Bibliometrix ETL Pipeline - Demo Notebook\n", + "## From Heterogeneous Bibliographic Data to a Unified Schema\n", + "\n", + "This notebook demonstrates the full ETL pipeline developed for the Bibliometrix-Python project. The pipeline standardizes bibliographic data from multiple sources (Scopus, PubMed, OpenAlex) into a unified Web of Science-compatible schema.\n", + "\n", + "### Pipeline Architecture\n", + "- **Extract**: Load data from local files or REST APIs\n", + "- **Transform**: Rename columns, enforce types, handle nulls, calculate SR\n", + "- **Validate**: Check schema, types, and null values before analysis" + ] + }, + { + "cell_type": "markdown", + "id": "d081bd6b-c2ee-4fed-a5c9-8e832ac3f85e", + "metadata": {}, + "source": [ + "## 1. Setup and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "edabf0ee-54f2-4840-974d-be5f0c6885f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All modules imported successfully!\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.insert(0, r\"C:\\Users\\mlosc\\bibliometrix-python\")\n", + "import pandas as pd\n", + "from www.services.etl.transformer import transform\n", + "from www.services.etl.validator import validate\n", + "from www.services.etl.api_retriever import retrieve_openalex, retrieve_pubmed\n", + "from www.services.etl.mappings import SCOPUS_CSV_MAPPING\n", + "print(\"All modules imported successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "ab5a1dbb-93ad-40a5-9e73-5ee7efdc2fa4", + "metadata": {}, + "source": [ + "## 2. Base Level - Loading a Scopus CSV File\n", + "In this section we demonstrate the BASE LEVEL of the pipeline.\n", + "We load a manually exported CSV file from Scopus and standardize it." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e8046794-78ff-4348-b19c-b84b63523845", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw shape: (20, 45)\n", + "\n", + "Raw column names (Scopus format):\n", + "['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end', 'Cited by', 'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract', 'Author Keywords', 'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts', 'References', 'Correspondence Address', 'Editors', 'Publisher', 'Sponsors', 'Conference name', 'Conference date', 'Conference location', 'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', 'Language of Original Document', 'Abbreviated Source Title', 'Document Type', 'Publication Stage', 'Open Access', 'Source', 'EID']\n" + ] + } + ], + "source": [ + "df_raw = pd.read_csv(\"scopus_test.csv\",encoding=\"utf-8\")\n", + "print(f\"Raw shape: {df_raw.shape}\")\n", + "print(f\"\\nRaw column names (Scopus format):\")\n", + "print(df_raw.columns.tolist())" + ] + }, + { + "cell_type": "markdown", + "id": "8a2faa93-e9fc-493e-b58a-2eb92af18b4f", + "metadata": {}, + "source": [ + "## 2.1 Transform - Applying the ETL Pipeline\n", + "We apply the mapping dictionary to rename columns to WoS tags, enforce correct data types, fill missing values, and calculate the SR field." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "de137c4b-0f66-4bd4-b3c2-09cbd123e51f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TRANSFORM] Starting transformation for source: SCOPUS\n", + "[TRANSFORM] Columns renamed.\n", + "[TRANSFORM] List columns enforced\n", + "[TRANSFORM] Integer columns enforced.\n", + "[TRANSFORM] Missing columns filled.\n", + "[TRANSFORM] Null values filled.\n", + "[TRANSFORM] SR field calculated.\n", + "[TRANSFORM] Done. Shape: (20, 47).\n", + "\n", + "Standardized shape: (20, 47)\n", + "\n", + "Standardized column names (WoS format):\n", + "['AU', 'AF', 'Author(s) ID', 'TI', 'PY', 'SO', 'VL', 'IS', 'Art. No.', 'BP', 'EP', 'TC', 'DI', 'Link', 'C1', 'Authors with affiliations', 'AB', 'DE', 'ID', 'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts', 'CR', 'RP', 'Editors', 'Publisher', 'Sponsors', 'Conference name', 'Conference date', 'Conference location', 'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PMID', 'LA', 'JI', 'DT', 'Publication Stage', 'Open Access', 'Source', 'UT', 'DB', 'SR']\n" + ] + } + ], + "source": [ + "df_scopus = transform(df_raw, SCOPUS_CSV_MAPPING, \"SCOPUS\")\n", + "print(f\"\\nStandardized shape: {df_scopus.shape}\")\n", + "print(f\"\\nStandardized column names (WoS format):\")\n", + "print(df_scopus.columns.tolist())" + ] + }, + { + "cell_type": "markdown", + "id": "7f7f7c73-eb45-4f5d-ab46-d6385369cfc0", + "metadata": {}, + "source": [ + "## 2.2 Validate - Checking the Standardized DataFrame\n", + "The validator checks that all mandatory columns are present, no null values remain, and list columns are correctly typed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e218f9b8-793b-4ce9-a0cf-a94361b2b9c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Running ETL Validation---\n", + "[OK] All mandatory 
columns are present.\n", + "[OK] No null values found.\n", + "[OK] All list columns are correctly typed.\n", + "---Validation PASSED---\n", + "\n", + "Sample of key standardized columns:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TIAUPYSOTCSR
0Investigation on the suppression of Gas-Coaldu...[Bin L., Jinzhang J., YangQiang, Dongming W., ...2027Fuel0Bin L., 2027, Fuel
1Impurity effects on CO2 trapping indices: A nu...[Alkhowaildi M., Tariq Z., AlTammar M.J., Hote...2027Fuel0Alkhowaildi M., 2027, Fuel
2Fully elucidating catalyst-driven combustion m...[Wen M., Han J., Zhang X., Zhao Y., Zhang Y., ...2027Fuel0Wen M., 2027, Fuel
3Wettability of geological formations in a CO2 ...[Aboushanab M., Arif M.]2027Fuel0Aboushanab M., 2027, Fuel
4Physics-informed dual integration machine lear...[Zhang M., Zhu W., Mao T., Cao J., Meng X., Bi...2027Fuel0Zhang M., 2027, Fuel
\n", + "
" + ], + "text/plain": [ + " TI \\\n", + "0 Investigation on the suppression of Gas-Coaldu... \n", + "1 Impurity effects on CO2 trapping indices: A nu... \n", + "2 Fully elucidating catalyst-driven combustion m... \n", + "3 Wettability of geological formations in a CO2 ... \n", + "4 Physics-informed dual integration machine lear... \n", + "\n", + " AU PY SO TC \\\n", + "0 [Bin L., Jinzhang J., YangQiang, Dongming W., ... 2027 Fuel 0 \n", + "1 [Alkhowaildi M., Tariq Z., AlTammar M.J., Hote... 2027 Fuel 0 \n", + "2 [Wen M., Han J., Zhang X., Zhao Y., Zhang Y., ... 2027 Fuel 0 \n", + "3 [Aboushanab M., Arif M.] 2027 Fuel 0 \n", + "4 [Zhang M., Zhu W., Mao T., Cao J., Meng X., Bi... 2027 Fuel 0 \n", + "\n", + " SR \n", + "0 Bin L., 2027, Fuel \n", + "1 Alkhowaildi M., 2027, Fuel \n", + "2 Wen M., 2027, Fuel \n", + "3 Aboushanab M., 2027, Fuel \n", + "4 Zhang M., 2027, Fuel " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = validate(df_scopus)\n", + "print(\"\\nSample of key standardized columns:\")\n", + "df_scopus[[\"TI\",\"AU\",\"PY\",\"SO\",\"TC\",\"SR\"]].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "0d067e7a-db8d-4a8e-ba22-c6e12b7c07ea", + "metadata": {}, + "source": [ + "## 3. Advanced Level - Retrieving Data via API\n", + "Here we demonstrate the ADVANCED LEVEL of the pipeline.\n", + "Data is retrieved automatically from OpenAlex and PubMed REST APIs using a simple text query, with no manual download required." + ] + }, + { + "cell_type": "markdown", + "id": "244029a4-cac5-43b6-864f-e275e436d198", + "metadata": {}, + "source": [ + "### 3.1 OpenAlex API" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b8872656-19b5-4461-9f55-620bed659f65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[OpenAlex] Searching for: 'machine learning' (max 10 results)\n", + "[OpenAlex] Page 1: retrieved 25 records. 
Total so far: 10\n", + "[OpenAlex] Done. Total records retrieved: 10\n", + "\n", + "Retrieved shape: (10, 23)\n", + "\n", + "First 3 titles:\n", + " - Scikit-learn: Machine Learning in Python\n", + " - Genetic algorithms in search, optimization, and machine learning\n", + " - C4.5: Programs for Machine Learning\n" + ] + } + ], + "source": [ + "df_openalex_raw = retrieve_openalex(query=\"machine learning\", max_results=10)\n", + "print(f\"\\nRetrieved shape: {df_openalex_raw.shape}\")\n", + "print(f\"\\nFirst 3 titles:\")\n", + "for title in df_openalex_raw[\"TI\"].head(3):\n", + " print(f\" - {title}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c9114131-27f9-46b8-a897-fd8ca583621d", + "metadata": {}, + "source": [ + "### 3.2 Transform and Validate OpenAlex data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1e1622a4-6787-4d30-b8a7-7073bdf097bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TRANSFORM] Starting transformation for source: OPENALEX\n", + "[TRANSFORM] Columns renamed.\n", + "[TRANSFORM] List columns enforced\n", + "[TRANSFORM] Integer columns enforced.\n", + "[TRANSFORM] Missing columns filled.\n", + "[TRANSFORM] Null values filled.\n", + "[TRANSFORM] SR field calculated.\n", + "[TRANSFORM] Done. Shape: (10, 24).\n", + "---Running ETL Validation---\n", + "[OK] All mandatory columns are present.\n", + "[OK] No null values found.\n", + "[OK] All list columns are correctly typed.\n", + "---Validation PASSED---\n", + "\n", + "Sample of key standardized columns:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TIAUPYSOTCSR
0Scikit-learn: Machine Learning in Python[]201263678, 2012,
1Genetic algorithms in search, optimization, an...[]198949333, 1989,
2C4.5: Programs for Machine Learning[]199223696, 1992,
\n", + "
" + ], + "text/plain": [ + " TI AU PY SO TC \\\n", + "0 Scikit-learn: Machine Learning in Python [] 2012 63678 \n", + "1 Genetic algorithms in search, optimization, an... [] 1989 49333 \n", + "2 C4.5: Programs for Machine Learning [] 1992 23696 \n", + "\n", + " SR \n", + "0 , 2012, \n", + "1 , 1989, \n", + "2 , 1992, " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_openalex = transform(df_openalex_raw, {}, \"OPENALEX\")\n", + "result = validate(df_openalex)\n", + "print(\"\\nSample of key standardized columns:\")\n", + "df_openalex[[\"TI\",\"AU\",\"PY\",\"SO\",\"TC\",\"SR\"]].head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "356be294-611c-4e12-b362-14865b3c205f", + "metadata": {}, + "source": [ + "### 3.3 PubMed API" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bcec90db-76aa-4fcf-a5a8-8d363742d1b6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[PubMed] Searching for: 'machine learning' (max 10 results)\n", + "[PubMed] Found 10 IDs.\n", + "[PubMed] Fetched batch 1.Total so far: 10\n", + "[PubMed] Done. 
Total records retrieved: 10\n", + "\n", + "Retrieve shape: (10, 23)\n", + "\n", + "First 3 titles:\n", + " - Prediction of an fMRI-based schizophrenia biomarker from EEG using dynamic\n", + " - fNIRS Single-trial decoding improves systematically with higher optode density,\n", + " - Comprehensive analysis of m6A RNA methylation regulators and the immune\n" + ] + } + ], + "source": [ + "df_pubmed_raw = retrieve_pubmed(query=\"machine learning\", max_results=10)\n", + "print(f\"\\nRetrieve shape: {df_pubmed_raw.shape}\")\n", + "print(f\"\\nFirst 3 titles:\")\n", + "for title in df_pubmed_raw[\"TI\"].head(3):\n", + " print(f\" - {title}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7d0e6149-a384-4b49-a9d5-1034c6f96051", + "metadata": {}, + "source": [ + "### 3.4 Transform and Validate PubMed data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "86ba80c8-d695-4781-a081-b581b9a8b317", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TRANSFORM] Starting transformation for source: PUBMED\n", + "[TRANSFORM] Columns renamed.\n", + "[TRANSFORM] List columns enforced\n", + "[TRANSFORM] Integer columns enforced.\n", + "[TRANSFORM] Missing columns filled.\n", + "[TRANSFORM] Null values filled.\n", + "[TRANSFORM] SR field calculated.\n", + "[TRANSFORM] Done. Shape: (10, 24).\n", + "---Running ETL Validation---\n", + "[OK] All mandatory columns are present.\n", + "[OK] No null values found.\n", + "[OK] All list columns are correctly typed.\n", + "---Validation PASSED---\n", + "n\\Sample of key standardized columns:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TIAUPYSOTCSR
0Prediction of an fMRI-based schizophrenia biom...[Tamano R, Ogawa T, Katagiri A, Cai C, Kawanab...2026Biomedical physics & engineering express0Tamano R, 2026, Biomedical physics & engineeri...
1fNIRS Single-trial decoding improves systemati...[Fischer T, Middell E, Moradi S, von Luhmann A]2026Journal of neural engineering0Fischer T, 2026, Journal of neural engineering
2Comprehensive analysis of m6A RNA methylation ...[Liu X, Hu J, Shi G, Zhu W, Hao Q]2026Frontiers in neurology0Liu X, 2026, Frontiers in neurology
\n", + "
" + ], + "text/plain": [ + " TI \\\n", + "0 Prediction of an fMRI-based schizophrenia biom... \n", + "1 fNIRS Single-trial decoding improves systemati... \n", + "2 Comprehensive analysis of m6A RNA methylation ... \n", + "\n", + " AU PY \\\n", + "0 [Tamano R, Ogawa T, Katagiri A, Cai C, Kawanab... 2026 \n", + "1 [Fischer T, Middell E, Moradi S, von Luhmann A] 2026 \n", + "2 [Liu X, Hu J, Shi G, Zhu W, Hao Q] 2026 \n", + "\n", + " SO TC \\\n", + "0 Biomedical physics & engineering express 0 \n", + "1 Journal of neural engineering 0 \n", + "2 Frontiers in neurology 0 \n", + "\n", + " SR \n", + "0 Tamano R, 2026, Biomedical physics & engineeri... \n", + "1 Fischer T, 2026, Journal of neural engineering \n", + "2 Liu X, 2026, Frontiers in neurology " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pubmed = transform(df_pubmed_raw, {}, \"PUBMED\")\n", + "result = validate(df_pubmed)\n", + "print(\"n\\Sample of key standardized columns:\")\n", + "df_pubmed[[\"TI\",\"AU\",\"PY\",\"SO\",\"TC\",\"SR\"]].head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "1682906a-734a-436d-a702-196c1251835b", + "metadata": {}, + "source": [ + "## 4. Exporting the Standardized DataFrame to CSV\n", + "The standardized DataFrame can be exported to CSV for use with the Bibliometrix-Python analytical functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4c63ab59-a3ec-4a1d-96ec-71302f1eaaf7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scopus standardized CSV saved: scopus_standardized.csv\n", + "OpenAlex standardized CSV saved: openalex_standardized.csv\n", + "PubMed standardized CSV saved: pubmed_standardized.csv\n", + "\n", + "All files exported successfully!\n" + ] + } + ], + "source": [ + "df_scopus.to_csv(\"scopus_standardized.csv\", index=False)\n", + "print(\"Scopus standardized CSV saved: scopus_standardized.csv\")\n", + "df_openalex.to_csv(\"openalex_standardized.csv\", index=False)\n", + "print(\"OpenAlex standardized CSV saved: openalex_standardized.csv\")\n", + "df_pubmed.to_csv(\"pubmed_standardized.csv\", index=False)\n", + "print(\"PubMed standardized CSV saved: pubmed_standardized.csv\")\n", + "print(\"\\nAll files exported successfully!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afa6b4a8-8275-43c9-af7d-9398a1cd117b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/www/services/__init__.py b/www/services/__init__.py index 28584e105..9a16e7bea 100644 --- a/www/services/__init__.py +++ b/www/services/__init__.py @@ -1,17 +1,2 @@ -from .biblionetwork import * -from .cocmatrix import * -from .couplingmap import * -from .format_functions import * -from .histnetwork import * -from .histplot import * -from .htmldownload import * -from .igraph2vis import * -from .metatagextraction import * -from .networkplot 
import * -from .parsers import * -from .plotlydownload import * -from .savereport import * -from .tabletag import * -from .termextraction import * -from .thematicmap import * -from .utils import * \ No newline at end of file +# Selective imports to avoid loading heavy dependencies automatically. +# Individual modules can still be imported directly when needed. \ No newline at end of file diff --git a/www/services/etl/___init__.py b/www/services/etl/___init__.py new file mode 100644 index 000000000..5f9941761 --- /dev/null +++ b/www/services/etl/___init__.py @@ -0,0 +1,4 @@ +from .standardizer import convert2df +from .transformer import transform +from .validator import validate +from .api_retriever import retrieve_openalex, retrieve_pubmed diff --git a/www/services/etl/api_retriever.py b/www/services/etl/api_retriever.py new file mode 100644 index 000000000..3ae35586b --- /dev/null +++ b/www/services/etl/api_retriever.py @@ -0,0 +1,281 @@ +""" +API Retriever module for the Bibliometrix ETL pipeline. +Retrieves bibliographic data from PubMed and OpenAlex REST APIs. +Handles pagination, rate limits, and retrieves automatically. +""" +import requests +import time +import pandas as pd + +OPENALEX_BASE_URL = "https://api.openalex.org/works" +PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" +PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" + +MAX_RETRIES = 3 +RETRY_DELAY = 2 +PAGE_SIZE = 25 + +def _get_with_retry(url:str, params:dict) -> dict: + """ + Perform a GET request with automatic retry on failure. + Waits RETRY_DELAY seconds between attempts + Args: + url:The endpoint URL to call. + params:Query parameters to include in the request. + Returns: + The parsed JSON response as a dictionary. + Raises: + RuntimeError: If all retry attempts fail. 
+    """
+    for attempt in range(1,MAX_RETRIES+1):
+        try:
+            response = requests.get(url, params=params, timeout=10)
+            response.raise_for_status()
+            return response.json()
+        except requests.RequestException as e:
+            print(f"[API] Attempt {attempt}/{MAX_RETRIES} failed: {e}")
+            if attempt < MAX_RETRIES:
+                time.sleep(RETRY_DELAY)
+    raise RuntimeError(f"All {MAX_RETRIES} attempts failed for URL: {url}")
+
+def _parse_openalex_record(work:dict) -> dict:
+    """
+    Parse a single OpenAlex work record into a flat dictionary
+    using the standard WoS-compatible field names (missing/null API fields become "" or []).
+    Args:
+        work:A single work object from the OpenAlex API response.
+    Returns:
+        A flat dictionary with standardized field names; AU, AF, C1, CR, DE and ID are lists.
+    """
+    authors = []
+    authors_full = []
+    affiliations = []
+    for authorship in work.get("authorships") or []:  # key must match the "authorships" field requested in retrieve_openalex
+        author_name = (authorship.get("author") or {}).get("display_name") or ""
+        authors.append(author_name)
+        authors_full.append(author_name)  # only one display name is read here, so AU and AF carry the same value
+        for inst in authorship.get("institutions") or []:
+            affiliations.append(inst.get("display_name") or "")
+    abstract = ""
+    inverted_index = work.get("abstract_inverted_index",{})
+    if inverted_index:  # maps word -> list of positions; rebuild the text by placing each word at its positions
+        words = [""] * (max(
+            pos for positions in inverted_index.values()
+            for pos in positions
+        )+1)
+        for word, positions in inverted_index.items():
+            for pos in positions:
+                words[pos] = word
+        abstract = " ".join(words)
+    keywords = [kw.get("display_name","") for kw in work.get("keywords") or []]
+    cited_refs = work.get("referenced_works") or []
+    source = work.get("primary_location",{}) or {}
+    source_info = source.get("source",{}) or {}
+    journal = source_info.get("display_name","") or ""  # the title lives on the nested source object, not on primary_location
+    journal_abbr = source_info.get("issn_l","") or ""  # OpenAlex names this field "issn_l" (letter L), not "issn_1"
+    biblio = work.get("biblio",{}) or {}
+    doi = work.get("doi","") or ""
+    doi = doi.replace("https://doi.org/","")
+    return {
+        "TI":work.get("title","") or "",
+        "AB":abstract,
+        "PY":str(work.get("publication_year","") or ""),  # avoid the literal string "None" for a null year
+        "SO":journal,
+        "JI":journal_abbr,
+        "VL":str(biblio.get("volume","") or ""),
+        "IS":str(biblio.get("issue","") or ""),
+        "BP":str(biblio.get("first_page","") or ""),
+        "EP":str(biblio.get("last_page","") or ""),
+        "DI":doi,
+        "UT":work.get("id",""),
+        "PMID":str((work.get("ids") or {}).get("pmid","") or "").replace("https://pubmed.ncbi.nlm.nih.gov/",""),
+        "DT":work.get("type",""),
+        "LA":work.get("language",""),
+        "TC":work.get("cited_by_count",0),
+        "AU":authors,
+        "AF":authors_full,
+        "C1":affiliations,
+        "CR":cited_refs,
+        "DE":keywords,
+        "ID":keywords,
+        "RP":"",
+        "DB":"OPENALEX"
+    }
+
+def retrieve_openalex(query:str, max_results:int = 100) -> pd.DataFrame:
+    """
+    Retrieve bibliographic records from OpenAlex for a given query.
+    Handles pagination automatically (PAGE_SIZE records per request, 0.5 s pause between pages).
+    Args:
+        query:The search query string (e.g. 'machine learning').
+        max_results: Maximum number of records to retrieve.
+    Returns:
+        A DataFrame with one row per record, using standard WoS field names.
+    """
+    print(f"[OpenAlex] Searching for: '{query}' (max {max_results} results)")
+    records = []
+    page = 1
+    while len(records) < max_results:
+        params = {
+            "search":query,
+            "per-page":PAGE_SIZE,
+            "page":page,
+            "select":"id,title,abstract_inverted_index,publication_year,primary_location,biblio,doi,ids,type,language,cited_by_count,authorships,keywords,referenced_works",
+        }
+        data = _get_with_retry(OPENALEX_BASE_URL, params)
+        works = data.get("results",[])
+        if not works:
+            print("[OpenAlex] No more results.")
+            break
+        for work in works:
+            records.append(_parse_openalex_record(work))
+            if len(records) >= max_results:  # stop mid-page once the cap is reached
+                break
+        print(f"[OpenAlex] Page {page}: retrieved {len(works)} records. Total so far: {len(records)}")
+        page += 1
+        time.sleep(0.5)
+    print(f"[OpenAlex] Done. Total records retrieved: {len(records)}")
+    return pd.DataFrame(records)
+
+def _fetch_pubmed_ids(query:str, max_results:int) -> list:
+    """
+    Search PubMed for a query and return a list of PubMed IDs.
+    Args:
+        query:The search query string.
+        max_results:Maximum number of IDs to retrieve.
+    Returns:
+        A list of PubMed ID strings.
+    """
+    params = {
+        "db":"pubmed",
+        "term":query,
+        "retmax":max_results,
+        "retmode":"json"
+    }
+    data = _get_with_retry(PUBMED_SEARCH_URL,params)
+    ids = data.get("esearchresult", {}).get("idlist",[])
+    print(f"[PubMed] Found {len(ids)} IDs.")
+    return ids
+
+def _fetch_pubmed_records(pmids:list) -> list:
+    """
+    Fetch full records for a list of PubMed IDs in batches of 20 (MEDLINE text format).
+    Args:
+        pmids:List of PubMed ID strings.
+    Returns:
+        A list of flat dictionaries with standardized field names.
+    Raises:
+        RuntimeError:If a batch still fails after MAX_RETRIES attempts.
+    """
+    records = []
+    batch_size = 20
+    for i in range(0,len(pmids),batch_size):
+        batch = pmids[i:i+batch_size]
+        params = {
+            "db":"pubmed",
+            "id":",".join(batch),
+            "retmode":"text",
+            "rettype":"medline"
+        }
+        for attempt in range(1, MAX_RETRIES+1):
+            try:
+                response = requests.get(PUBMED_FETCH_URL, params=params, timeout=10)
+                response.raise_for_status()
+                break
+            except requests.RequestException as e:
+                print(f"[PubMed] Attempt {attempt}/{MAX_RETRIES} failed: {e}")
+                if attempt < MAX_RETRIES:
+                    time.sleep(RETRY_DELAY)
+        else:  # for-else: reached only when no attempt hit `break`, i.e. every attempt failed
+            raise RuntimeError(f"Failed to fetch PubMed batch starting at index {i}")
+        records.extend(_parse_pubmed_text(response.text))
+        print(f"[PubMed] Fetched batch {i//batch_size+1}.Total so far: {len(records)}")
+        time.sleep(0.5)
+    return records
+
+def _medline_to_standard(record:dict) -> dict:
+    """
+    Convert a raw MEDLINE record dictionary to standard WoS field names.
+    Args:
+        record:A raw dictionary with MEDLINE field tags as keys (repeated tags joined with ';').
+    Returns:
+        A flat dictionary with standardized WoS field names.
+    """
+    authors = [a.strip() for a in record.get("AU","").split(";") if a.strip()]
+    authors_full = [a.strip() for a in record.get("FAU","").split(";") if a.strip()]
+    import re
+    year_match = re.search(r"\d{4}", record.get("DP",""))  # first 4-digit run in the publication date
+    year = year_match.group(0) if year_match else ""
+    doi = next((lid.replace("[doi]","").strip() for lid in record.get("LID","").split(";") if "[doi]" in lid), "")  # LID may also hold "[pii]" ids; keep only the DOI
+    keywords = [k.strip().replace("*","") for k in record.get("MH","").split(";") if k.strip()]
+    return {
+        "TI":record.get("TI", ""),
+        "AB":record.get("AB", ""),
+        "PY":year,
+        "SO":record.get("JT", ""),
+        "JI":record.get("TA", ""),
+        "VL":record.get("VI", ""),
+        "IS":record.get("IP", ""),
+        "BP":record.get("PG", "").split("-")[0] if record.get("PG") else "",
+        "EP":record.get("PG", "").split("-")[-1] if record.get("PG") else "",
+        "DI":doi,
+        "UT":record.get("PMID", ""),
+        "PMID":record.get("PMID", ""),
+        "DT":record.get("PT", ""),
+        "LA":record.get("LA", ""),
+        "TC":0,
+        "AU":authors,
+        "AF":authors_full,
+        "C1":[a.strip() for a in record.get("AD", "").split(";") if a.strip()],
+        "CR":[],
+        "DE":keywords,
+        "ID":keywords,
+        "RP":"",
+        "DB":"PUBMED"
+    }
+
+def _parse_pubmed_text(text:str) -> list:
+    """
+    Parse PubMed MEDLINE text format into a list of flat dictionaries.
+    Args:
+        text:Raw MEDLINE text; records are separated by blank lines, wrapped values continue on indented lines.
+    Returns:
+        A list of flat dictionaries with standardized field names.
+    """
+    records = []
+    current = {}
+    current_key = None
+    for line in text.splitlines():
+        if line.strip() == "":
+            if current:
+                records.append(_medline_to_standard(current))
+                current = {}
+                current_key = None
+            continue
+        if line[:4].strip() and line[4:6] == "- ":
+            current_key = line[:4].strip()
+            value = line[6:].strip()
+            # a tag that repeats within one record (AU, MH, ...) is accumulated with ';' as separator
+            current[current_key] = f"{current[current_key]};{value}" if current_key in current else value
+        elif current_key is not None and line[:1].isspace():
+            current[current_key] += " " + line.strip()  # indented line continues the previous field (long TI/AB/AD values)
+    if current:
+        records.append(_medline_to_standard(current))
+    return records
+
+def retrieve_pubmed(query:str, max_results:int=100) -> pd.DataFrame:
+    """
+    Retrieve bibliographic records from PubMed for a given query.
+    Args:
+        query:The search query string (e.g. 'machine learning').
+        max_results:Maximum number of records to retrieve.
+    Returns:
+        A DataFrame with one row per record, using standard WoS field names.
+    """
+    print(f"[PubMed] Searching for: '{query}' (max {max_results} results)")
+    pmids = _fetch_pubmed_ids(query, max_results)
+    if not pmids:
+        print("[PubMed] No results found.")
+        return pd.DataFrame()
+    records = _fetch_pubmed_records(pmids)
+    print(f"[PubMed] Done. Total records retrieved: {len(records)}")
+    return pd.DataFrame(records)
+
diff --git a/www/services/etl/mappings/__init__.py b/www/services/etl/mappings/__init__.py
new file mode 100644
index 000000000..b5aa1178f
--- /dev/null
+++ b/www/services/etl/mappings/__init__.py
@@ -0,0 +1,5 @@
+from .scopus_mappin import SCOPUS_CSV_MAPPING
+from .dimensions_mapping import DIMENSIONS_MAPPING
+from .pubmed_mapping import PUBMED_MAPPING
+from .openlex_mapping import OPENALEX_MAPPING
+
diff --git a/www/services/etl/mappings/dimensions_mapping.py b/www/services/etl/mappings/dimensions_mapping.py
new file mode 100644
index 000000000..850359ab2
--- /dev/null
+++ b/www/services/etl/mappings/dimensions_mapping.py
@@ -0,0 +1,21 @@
+"""
+Mapping dictionary for Dimensions exported data.
+Maps raw Dimensions column names to standard WoS field tags.
+"""
+DIMENSIONS_MAPPING = {
+    "Title":"TI",
+    "Abstract":"AB",
+    "PubYear":"PY",
+    "Source title":"SO",
+    "Volume":"VL",
+    "Issue":"IS",
+    "Pagination":"BP",
+    "DOI":"DI",
+    "Publication ID":"UT",
+    "PMID":"PMID",
+    "Publication Type":"DT",
+    "Times cited":"TC",
+    "Authors":"AU",
+    "Corresponding Authors":"RP",
+    "MeSH terms":"DE"
+}
\ No newline at end of file
diff --git a/www/services/etl/mappings/openlex_mapping.py b/www/services/etl/mappings/openlex_mapping.py
new file mode 100644
index 000000000..f218c047a
--- /dev/null
+++ b/www/services/etl/mappings/openlex_mapping.py
@@ -0,0 +1,21 @@
+"""
+Mapping dictionary for OpenAlex API response data.
+Maps raw OpenAlex field names to standard WoS field tags.
+"""
+OPENALEX_MAPPING = {
+    "title":"TI",
+    "abstract":"AB",
+    "publication_year":"PY",
+    "primary_location.source.display_name":"SO",
+    "primary_location.source.issn_l":"JI",  # OpenAlex field is "issn_l" (letter L), not "issn_1"
+    "biblio.volume":"VL",
+    "biblio.issue":"IS",
+    "biblio.first_page":"BP",
+    "biblio.last_page":"EP",
+    "doi":"DI",
+    "id":"UT",
+    "ids.pmid":"PMID",
+    "type":"DT",
+    "language":"LA",
+    "cited_by_count":"TC"  # same spelling as the field requested and read in api_retriever.py
+}
\ No newline at end of file
diff --git a/www/services/etl/mappings/pubmed_mapping.py b/www/services/etl/mappings/pubmed_mapping.py
new file mode 100644
index 000000000..d669248ce
--- /dev/null
+++ b/www/services/etl/mappings/pubmed_mapping.py
@@ -0,0 +1,23 @@
+"""
+Mapping dictionary for PubMed exported data.
+Maps raw PubMed field tags to standard WoS field tags.
+"""
+PUBMED_MAPPING = {
+    "TI":"TI",
+    "AB":"AB",
+    "DP":"PY",
+    "JT":"SO",
+    "TA":"JI",
+    "VI":"VL",
+    "IP":"IS",
+    "PG":"BP",
+    "LID":"DI",
+    "PMID":"PMID",
+    "PT":"DT",
+    "LA":"LA",
+    "AU":"AU",
+    "FAU":"AF",
+    "AD":"C1",
+    "MH":"DE",
+    "GR":"FU"
+}
\ No newline at end of file
diff --git a/www/services/etl/mappings/scopus_mappin.py b/www/services/etl/mappings/scopus_mappin.py
new file mode 100644
index 000000000..b6327c17e
--- /dev/null
+++ b/www/services/etl/mappings/scopus_mappin.py
@@ -0,0 +1,29 @@
+"""
+Mapping dictionary for Scopus exported data.
+Maps raw Scopus column names to standard WoS tags +""" + +SCOPUS_CSV_MAPPING = { + "Title":"TI", + "Abstract":"AB", + "Year":"PY", + "Source title":"SO", + "Abbreviated Source Title":"JI", + "Volume":"VL", + "Issue":"IS", + "Page start":"BP", + "Page end":"EP", + "DOI":"DI", + "EID":"UT", + "PubMed ID":"PMID", + "Document Type":"DT", + "Language of Original Document":"LA", + "Cited by":"TC", + "Authors":"AU", + "Author full names":"AF", + "Affiliations":"C1", + "Correspondence Address":"RP", + "References":"CR", + "Author Keywords":"DE", + "Index Keywords":"ID" +} \ No newline at end of file diff --git a/www/services/etl/standardizer.py b/www/services/etl/standardizer.py new file mode 100644 index 000000000..579af14e4 --- /dev/null +++ b/www/services/etl/standardizer.py @@ -0,0 +1,117 @@ +""" +Standardizer module for the Bibliometrix ETL pipeline. +This is the main point of the pipeline, equivalent to the convert2df() function in the R version of the Bibliometrix. +Usage: + from www.services.etl.standardizer import convert2df + #From a local file: + df = convert2df(source="scopus",filepath="data/scopus_export.csv") + #From an API query: + df = convert2df(source="openlax",query="machine learning",max_results=100) +""" +import pandas as pd +from www.services.etl.transformer import transform +from www.services.etl.validator import validate +from www.services.etl.api_retriever import retrieve_openalex, retrieve_pubmed +from www.services.etl.mappings import (SCOPUS_CSV_MAPPING, DIMENSIONS_MAPPING, PUBMED_MAPPING, OPENALEX_MAPPING) + +FILE_SOURCES = { + "scopus":SCOPUS_CSV_MAPPING, + "dimension":DIMENSIONS_MAPPING, + "pubmed":PUBMED_MAPPING +} + +API_SOURCES = { + "openalex":retrieve_openalex, + "pubmed":retrieve_pubmed +} + +def extract_file(source:str,filepath:str) -> pd.DataFrame: + """ + Extract raw data from a local file based on the source type. + Supports CSV, XLSX, and TXT (PubMed MEDLINE format). + Args: + source:The source database name (e.g. 
'scopus','dimensions'). + filepath:The path to the local file to load. + Returns: + A raw DataFrame loaded from the file. + Raises: + ValueError:If the source or the file type is not supported. + """ + print(f"[EXTRACT] Loading file: {filepath} (source: {source})") + if source == "scopus": + if filepath.endswith(".csv"): + return pd.read_csv(filepath,encoding="utf-8") + else: + raise ValueError(f"Scopus only supports .csv files. Go: {filepath}") + elif source == "dimensions": + if filepath.endswith(".xlsx"): + return pd.read_excel(filepath,skiprows=1) + elif filepath.endswith(".csv"): + return pd.read_csv(filepath,skiprows=1,encoding="utf-8") + else: + raise ValueError(f"Dimensions only support .csv or .xlsx files.Got: {filepath}") + elif source == "pubmed": + if filepath.endswith(".txt"): + from www.services.etl.api_retriever import _parse_pubmed_text + with open(filepath, "r", encoding="utf-8") as f: + text = f.read() + records = _parse_pubmed_text(text) + return pd.DataFrame(records) + else: + raise ValueError(f"PubMed only supports .txt files.Got: {filepath}") + else: + raise ValueError(f"Unsupported file source: '{source}'." + f"Supported sources: {list(FILE_SOURCES.keys())}") + +def convert2df( + source:str, + filepath:str=None, + query:str=None, + max_results:int=100, + run_validation:bool=True, +) -> pd.DataFrame: + """ + Main entry point of the Bibliometrix ETL pipeline. + Converts heterogeneous bibliographic data into a standardized DataFrame. + Equivalent to the convert2df() function in the R version of the Bibliometrix. + Can operate in two modes: + - FILE MODE:loads a manually exported file (Base Level) + - API MODE:retrieves data automatically via REST API (Advanced Level) + Args: + source:The data source. Supported values: + File mode: 'scopus','dimensions';'pubmed' + API mode:'openalex','pubmed' + filepath:Path to the local file (required for file mode). + query:Search query string (required for API mode). 
+ max_results:Maximum number of records to retrieve (API mode only). + run_validation:If True, runs the validator before returning the DataFrame. + Returns: + A fully standardized pandas DataFrame ready for Bibliometrix analysis. + Raises: + ValueError: If neither filepath nor query is provided, or if the source is not supported. + Examples: + >>> df = convert2df(source="scopus",filepath="scopus_sxport.csv") + >>> df = convert2df(source="openalex",query="deep learning",max_results=50) + """ + print(f"[convert2df] Source: {source} | Mode: {'API' if query else 'FILE'}") + if query is not None: + if source not in API_SOURCES: + raise ValueError(f"API not supported for source: '{source}'. Supported API sources: {list(API_SOURCES.keys())}") + retriever = API_SOURCES[source] + df = retriever(query=query, max_results=max_results) + mapping = [] + db_name = source.upper() + elif filepath is not None: + if source not in FILE_SOURCES: + raise ValueError(f"File mode not supported for source: '{source}'. Supported file sources: {list(FILE_SOURCES.keys())}") + df = extract_file(source, filepath) + mapping = FILE_SOURCES[source] + db_name = source.upper() + else: + raise ValueError("You must provide either 'filepath' (file mode) or 'query' (API mode).") + df = transform(df,mapping,db_name) + if run_validation: + validate(df) + return df + + diff --git a/www/services/etl/transformer.py b/www/services/etl/transformer.py new file mode 100644 index 000000000..90bd374fd --- /dev/null +++ b/www/services/etl/transformer.py @@ -0,0 +1,175 @@ +""" +Transformer module for the Bibliometrix ETL pipeline. +Handles column renaming, type enforcement, null handling, +and derived field calculation. 
+""" +import pandas as pd +import re + +LIST_COLUMNS = ["AU","AF","C1","CR","DE","ID"] +INT_COLUMNS = ["TC"] +COLUMN_DEFAULT = { + "DB":"", + "UT":"", + "DI":"", + "PMID":"", + "TI":"", + "SO":"", + "JI":"", + "PY":"", + "DT":"", + "LA":"", + "TC":0, + "AU":[], + "AF":[], + "C1":[], + "RP":"", + "CR":[], + "DE":[], + "ID":[], + "AB":"", + "VL":"", + "IS":"", + "BP":"", + "EP":"", + "SR":"" +} + +def rename_columns(df:pd.DataFrame, mapping:dict) ->pd.DataFrame: + """ + Rename raw source columns to standard WoS field tags using a mapping dictionary. + Args: + df: The raw DataFrame from the source. + mapping: A dictionary mapping raw column names to WoS tags. + Returns: + A new DataFrame with renamed columns. + """ + existing_mapping = {k: v for k,v in mapping.items() if k in df.columns} + return df.rename(columns=existing_mapping) + +def enforce_list_columns(df:pd.DataFrame) -> pd.DataFrame: + """ + Ensure that all list columns contain Python lists of strings. + Splits string values using semicolon as delimeter. + Args: + df: The DataFrame after column renaming. + Returns: + The DataFrame with list columns properly typed. + """ + for col in LIST_COLUMNS: + if col in df.columns: + def to_list(val): + if isinstance(val,list): + return val + if pd.isna(val) or val == "" or val is None: + return [] + if isinstance(val,str): + return [item.strip() for item in val.split(";") if item.strip()] + return [str(val)] + df[col] = df[col].apply(to_list) + return df + +def enforce_int_columns(df:pd.DataFrame) -> pd.DataFrame: + """ + Ensure that integer columns are properly cast to integers. + Replaces nulls with 0. + Args: + df:The DataFrame after columns renaming. + Returns: + The DataFrame with integer columns properly typed. 
+ """ + for col in INT_COLUMNS: + if col in df.columns: + df[col] = pd.to_numeric(df[col],errors="coerce").fillna(0).astype(int) + return df + +def fill_missing_columns(df:pd.DataFrame, db_name:str) -> pd.DataFrame: + """ + Add any missing mandatory columns with their default empty values. + Also sets the DB column to identify the data source. + Args: + df:The DataFrame after type enforcement. + db_name:The name of the source database (e.g. 'SCOPUS', 'PUBMED'). + Returns: + The DataFrame with all mandatory columns present. + """ + for col,default in COLUMN_DEFAULT.items(): + if col not in df.columns: + if isinstance(default,list): + df[col] = [[] for _ in range(len(df))] + else: + df[col] = default + df["DB"] = db_name.upper() + return df + +def fill_null_values(df:pd.DataFrame) -> pd.DataFrame: + """ + Replace all remaining NaN and None values with appropriate defaluts. + List columns get empty lists, all others get empty strings. + Args: + df:The DataFrame before final export. + Returns: + The DataFrame with no null values remaining. + """ + for col in df.columns: + if col in LIST_COLUMNS: + df[col] = df[col].apply(lambda x: x if isinstance(x,list) else []) + elif col == "TC": + df[col] = df[col].fillna(0) + else: + df[col] = df[col].fillna("") + return df + +def calculate_sr(df:pd.DataFrame) -> pd.DataFrame: + """ + Calculate the Short Reference (SR) field for each record. + Format: 'FirstAuthorSurname, PublicationYear, JournalName' + This field is used as a primary key in a citation network analyses. + Args: + df:The DataFrame with AU, PY, and SO columns populated. + Returns: + The DataFrame with the SR column filled. 
+ """ + def build_sr(row): + authors = row.get("AU", []) + if isinstance(authors,list) and len(authors)>0: + first_author = authors[0].split(",")[0].strip() + else: + first_author = "" + year = str(row.get("PY","")).strip() + journal = str(row.get("SO","")).strip() + return f"{first_author}, {year}, {journal}" + df["SR"] = df.apply(build_sr, axis=1) + return df + +def transform(df:pd.DataFrame, mapping:dict, db_name:str) -> pd.DataFrame: + """ + Run the full transformation pipeline on a row DataFrame. + Steps: rename -> enforce types -> fill missing columns -> fill nulls -> calculate SR. + Args: + df:The raw DataFrame loaded from the source file or API. + mapping:The column mapping dictionary for this source. + db_name:The name of the source database. + Returns: + A fully standardized DataFrame ready for validation and analysis. + """ + print(f"[TRANSFORM] Starting transformation for source: {db_name}") + df = rename_columns(df,mapping) + print("[TRANSFORM] Columns renamed.") + df = enforce_list_columns(df) + print("[TRANSFORM] List columns enforced") + df = enforce_int_columns(df) + print("[TRANSFORM] Integer columns enforced.") + df = fill_missing_columns(df, db_name) + print("[TRANSFORM] Missing columns filled.") + df = fill_null_values(df) + print("[TRANSFORM] Null values filled.") + df = calculate_sr(df) + print("[TRANSFORM] SR field calculated.") + print(f"[TRANSFORM] Done. Shape: {df.shape}.") + return df + + + + + \ No newline at end of file diff --git a/www/services/etl/validator.py b/www/services/etl/validator.py new file mode 100644 index 000000000..98f881d97 --- /dev/null +++ b/www/services/etl/validator.py @@ -0,0 +1,92 @@ +""" +Validator module for the Bibliometrix ETL pipeline. +Checks that the standardized DataFrame meets the required schema +before it is passed to the analytical functions. 
+""" +import pandas as pd + +MANDATORY_COLUMNS = [ + "DB","UT","DI","PMID","TI","SO","JI","PY","DT","LA","TC","AU","AF","C1","RP","CR","DE","ID","AB","VL","IS","BP","EP","SR" +] + +LIST_COLUMNS = ["AU","AF","C1","CR","DE","ID"] + +def check_mandatory_columns(df: pd.DataFrame) -> list: + """ + Check that all mandatory columns are present in the DataFrame. + Args: + df:The standardized DataFrame to validate. + Returns: + A list of missing column names. Empty list means all columns are present. + """ + missing = [col for col in MANDATORY_COLUMNS if col not in df.columns] + return missing + +def check_no_nulls(df:pd.DataFrame) -> dict: + """ + Check that no Nan or None value remain in the DataFrame. + Args: + df:The standardized DataFrame to validate. + Returns: + A dictionary mapping column names to the count of null values found. + Empty dict means no nulls found + """ + null_counts = {} + for col in df.columns: + count = df[col].isna().sum() + if count > 0: + null_counts[col] = int(count) + return null_counts + +def check_list_columns(df:pd.DataFrame) -> list: + """ + Check that list columns contain actual Python lists and not strings. + Args: + df:The standardized DataFrame to validate. + Returns: + A list of column name where the type contract is violated. + """ + violations = [] + for col in LIST_COLUMNS: + if col in df.columns: + sample = df[col].dropna() + for val in sample: + if not isinstance(val,list): + violations.append(col) + break + return violations + +def validate(df: pd.DataFrame) -> bool: + """ + Run all validation checks on the standardized DataFrame. + Prints a report of any issues found. + Args: + df:The standardized DataFrame to validate. + Returns: + True if all checks pass, False if any check fails. 
+ """ + print("---Running ETL Validation---") + passed = True + missing_cols = check_mandatory_columns(df) + if missing_cols: + print(f"[FAIL] Missing mandatory columns: {missing_cols}") + passed = False + else: + print("[OK] All mandatory columns are present.") + null_counts = check_no_nulls(df) + if null_counts: + print(f"[FAIL] Null values found: {null_counts}") + passed = False + else: + print("[OK] No null values found.") + violations = check_list_columns(df) + if violations: + print(f"[FAIL] List type violations in columns: {violations}") + passed = False + else: + print("[OK] All list columns are correctly typed.") + if passed: + print("---Validation PASSED---") + else: + print("---Validation FAILED---") + return passed From 0546739a9154da5b1445834722d78bfba40126b3 Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Fri, 5 Jun 2026 16:32:35 +0200 Subject: [PATCH 2/9] Fix file naming: rename misspelled ETL module files --- www/services/etl/{___init__.py => __init__.py} | 0 .../etl/mappings/{openlex_mapping.py => openalex_mapping.py} | 0 www/services/etl/mappings/{scopus_mappin.py => scopus_mapping.py} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename www/services/etl/{___init__.py => __init__.py} (100%) rename www/services/etl/mappings/{openlex_mapping.py => openalex_mapping.py} (100%) rename www/services/etl/mappings/{scopus_mappin.py => scopus_mapping.py} (100%) diff --git a/www/services/etl/___init__.py b/www/services/etl/__init__.py similarity index 100% rename from www/services/etl/___init__.py rename to www/services/etl/__init__.py diff --git a/www/services/etl/mappings/openlex_mapping.py b/www/services/etl/mappings/openalex_mapping.py similarity index 100% rename from www/services/etl/mappings/openlex_mapping.py rename to www/services/etl/mappings/openalex_mapping.py diff --git a/www/services/etl/mappings/scopus_mappin.py b/www/services/etl/mappings/scopus_mapping.py similarity index 100% rename from www/services/etl/mappings/scopus_mappin.py 
rename to www/services/etl/mappings/scopus_mapping.py From b6582b0d4d0530e22811c12ceb058007919a96d0 Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Mon, 8 Jun 2026 12:13:12 +0200 Subject: [PATCH 3/9] Fix analytical functions: replace df.get()/df.set() with pandas standard methods, and missing imports --- functions/__init__.py | 45 +------------------ .../get_affiliationproductionovertime.py | 2 +- functions/get_annualproduction.py | 5 ++- functions/get_authorlocalimpact.py | 2 +- functions/get_authorproductionovertime.py | 2 +- functions/get_averagecitations.py | 2 +- functions/get_bradfordlaw.py | 2 +- functions/get_citedcountries.py | 2 +- functions/get_citeddocuments.py | 2 +- functions/get_co_occurence_network.py | 2 +- functions/get_collaborationnetwork.py | 2 +- functions/get_correspondingauthorcountries.py | 2 +- functions/get_countriesproduction.py | 2 +- functions/get_countriesproductionovertime.py | 2 +- functions/get_data.py | 8 ++-- functions/get_factorialanalysis.py | 2 +- functions/get_filters.py | 2 +- functions/get_frequentwords.py | 2 +- functions/get_localcitedauthors.py | 2 +- functions/get_localciteddocuments.py | 2 +- functions/get_localcitedreferences.py | 2 +- functions/get_localcitedsources.py | 2 +- functions/get_lotkalaw.py | 2 +- functions/get_maininformations.py | 7 ++- functions/get_referencesspectroscopy.py | 2 +- functions/get_relevantaffiliations.py | 2 +- functions/get_relevantauthors.py | 2 +- functions/get_relevantsources.py | 2 +- functions/get_sourceslocalimpact.py | 2 +- functions/get_sourcesproduction.py | 2 +- functions/get_table.py | 4 +- functions/get_treemap.py | 2 +- functions/get_trendtopics.py | 2 +- functions/get_wordcloud.py | 2 +- functions/get_worldmapcollaboration.py | 2 +- www/services/cocmatrix.py | 2 +- www/services/couplingmap.py | 6 +-- www/services/etl/mappings/__init__.py | 4 +- www/services/histnetwork.py | 2 +- www/services/metatagextraction.py | 6 +-- www/services/termextraction.py | 2 +- 
www/services/thematicmap.py | 2 +- 42 files changed, 58 insertions(+), 95 deletions(-) diff --git a/functions/__init__.py b/functions/__init__.py index 20e24de36..9a16e7bea 100644 --- a/functions/__init__.py +++ b/functions/__init__.py @@ -1,43 +1,2 @@ -from .get_affiliationproductionovertime import * -from .get_annualproduction import * -from .get_authorlocalimpact import * -from .get_authorproductionovertime import * -from .get_averagecitations import * -from .get_bradfordlaw import * -from .get_citedcountries import * -from .get_citeddocuments import * -from .get_clusteringcoupling import * -from .get_correspondingauthorcountries import * -from .get_countriesproduction import * -from .get_countriesproductionovertime import * -from .get_data import * -from .get_database import * -from .get_filters import * -from .get_frequentwords import * -from .get_localcitedauthors import * -from .get_localciteddocuments import * -from .get_localcitedreferences import * -from .get_localcitedsources import * -from .get_lotkalaw import * -from .get_maininformations import * -from .get_referencesspectroscopy import * -from .get_relevantaffiliations import * -from .get_relevantauthors import * -from .get_relevantsources import * -from .get_sourceslocalimpact import * -from .get_sourcesproduction import * -from .get_status import * -from .get_table import * -from .get_threefieldplot import * -from .get_treemap import * -from .get_trendtopics import * -from .get_wordcloud import * -from .get_wordfrequency import * -from .get_co_occurence_network import * -from .get_thematicmap import * -from .get_factorialanalysis import * -from .get_historiograph import * -from .get_thematicevolution import * -from .get_cocitation import * -from .get_collaborationnetwork import * -from .get_worldmapcollaboration import * \ No newline at end of file +# Selective imports to avoid loading heavy dependencies automatically. +# Individual modules can still be imported directly when needed. 
\ No newline at end of file diff --git a/functions/get_affiliationproductionovertime.py b/functions/get_affiliationproductionovertime.py index e1b87f583..3b8e041fd 100644 --- a/functions/get_affiliationproductionovertime.py +++ b/functions/get_affiliationproductionovertime.py @@ -12,7 +12,7 @@ def get_affiliation_production_over_time(df, top_k_affiliations): Returns: A Plotly figure object representing the affiliation's production over time. """ - data = df.get() + data = df.copy() AFF = data["AU_UN"].dropna().apply(lambda x: [aff for aff in x if aff.strip() != ""]) nAFF = [len(aff) for aff in AFF] diff --git a/functions/get_annualproduction.py b/functions/get_annualproduction.py index dd27105c2..4c3f4b14a 100644 --- a/functions/get_annualproduction.py +++ b/functions/get_annualproduction.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_annual_production(df): @@ -11,7 +14,7 @@ def get_annual_production(df): Returns: A Plotly figure object representing the annual scientific production. """ - data = df.get() + data = df.copy() # Calculate the number of publications per year publications_per_year = data["PY"].value_counts().sort_index().reset_index() diff --git a/functions/get_authorlocalimpact.py b/functions/get_authorlocalimpact.py index 74a68e263..a1ca0946e 100644 --- a/functions/get_authorlocalimpact.py +++ b/functions/get_authorlocalimpact.py @@ -13,7 +13,7 @@ def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impac Returns: A Plotly figure object and a DataFrame of the most impactful sources. 
""" - df = df.get() + df = df.copy() today = pd.Timestamp.now().year # Ensure 'TC' and 'PY' are numeric diff --git a/functions/get_authorproductionovertime.py b/functions/get_authorproductionovertime.py index 65edaca96..e21a3dcad 100644 --- a/functions/get_authorproductionovertime.py +++ b/functions/get_authorproductionovertime.py @@ -16,7 +16,7 @@ def get_author_production_over_time(df, top_k_authors): table_authors_production (pd.DataFrame): Table summarizing authors' production with TC and TCpY. table_documents (pd.DataFrame): Detailed table with additional document information. """ - data = df.get() + data = df.copy() # Ensure "PY" is numeric data["PY"] = pd.to_numeric(data["PY"], errors="coerce") diff --git a/functions/get_averagecitations.py b/functions/get_averagecitations.py index d752aa9b7..fbff6a8a0 100644 --- a/functions/get_averagecitations.py +++ b/functions/get_averagecitations.py @@ -11,7 +11,7 @@ def get_average_citations(df): Returns: A Plotly figure object representing the average citations per year. """ - data = df.get() + data = df.copy() # Calculate the current year current_year = pd.Timestamp.now().year + 1 diff --git a/functions/get_bradfordlaw.py b/functions/get_bradfordlaw.py index 86580591f..d71532fbd 100644 --- a/functions/get_bradfordlaw.py +++ b/functions/get_bradfordlaw.py @@ -12,7 +12,7 @@ def get_bradford_law(df): A Plotly figure object and a DataFrame of the Bradford's Law zones. 
""" # Sort data by frequency of occurrence (equivalent to R's sort(table(M$SO), decreasing = TRUE)) - data = df.get() + data = df.copy() source_counts = data["SO"].value_counts() # Total number of sources diff --git a/functions/get_citedcountries.py b/functions/get_citedcountries.py index ac95a8d0c..5021cf3bd 100644 --- a/functions/get_citedcountries.py +++ b/functions/get_citedcountries.py @@ -15,7 +15,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure): """ # Extract metadata tags for cited countries df = metaTagExtraction(df, "AU1_CO") - df = df.get() + df = df.copy() # Prepare the table for ranking countries tab = ( diff --git a/functions/get_citeddocuments.py b/functions/get_citeddocuments.py index 14491f74a..024db6b0d 100644 --- a/functions/get_citeddocuments.py +++ b/functions/get_citeddocuments.py @@ -15,7 +15,7 @@ def get_cited_documents(df, num_of_cited_docs, cited_docs_measure): """ # Extract metadata tags for cited documents df = metaTagExtraction(df, "SR") - df = df.get() + df = df.copy() # Prepare the table for ranking documents current_year = pd.to_datetime("today").year diff --git a/functions/get_co_occurence_network.py b/functions/get_co_occurence_network.py index ec96b143a..c665d0ccd 100644 --- a/functions/get_co_occurence_network.py +++ b/functions/get_co_occurence_network.py @@ -479,7 +479,7 @@ def field_by_year(df, field_cn, timespan=None, min_freq=2, n_items=5, remove_ter The field to analyze ('ID', 'DE', 'TI', 'AB', 'WC') """ # Get the field data - M = df.get() + M = df.copy() # Create co-occurrence matrix A = cocMatrix(df, field_cn, binary=False, remove_terms=remove_terms, synonyms=synonyms) diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py index 512ed7489..d0f477879 100644 --- a/functions/get_collaborationnetwork.py +++ b/functions/get_collaborationnetwork.py @@ -46,7 +46,7 @@ def get_collaboration_network( print("Generating collaboration network...") M = df - m = 
df.get() + m = df.copy() NetRefs = None Title = "" diff --git a/functions/get_correspondingauthorcountries.py b/functions/get_correspondingauthorcountries.py index 5ba9832b2..84a3cbad8 100644 --- a/functions/get_correspondingauthorcountries.py +++ b/functions/get_correspondingauthorcountries.py @@ -15,7 +15,7 @@ def get_corresponding_author_countries(df, top_k_countries): # Estrai i metadati "AU_CO" e "AU1_CO" e verifica il tipo di dati df = metaTagExtraction(df, Field="AU_CO") # Assumendo che `metaTagExtraction` sia già definita df = metaTagExtraction(df, Field="AU1_CO") - data = df.get() # Se `df` è un oggetto reattivo + data = df.copy() # Se `df` è un oggetto reattivo # Assicurati che le colonne siano di tipo stringa e rimuovi righe con valori mancanti data = data.dropna(subset=["AU1_CO", "AU_CO"]) diff --git a/functions/get_countriesproduction.py b/functions/get_countriesproduction.py index 81c0e0c34..7de516ca9 100644 --- a/functions/get_countriesproduction.py +++ b/functions/get_countriesproduction.py @@ -13,7 +13,7 @@ def get_countries_production(df): """ # Assicurati che i metadati siano stati estratti df = metaTagExtraction(df, "AU_CO") - df = df.get() + df = df.copy() # Conta le occorrenze dei paesi df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x]) diff --git a/functions/get_countriesproductionovertime.py b/functions/get_countriesproductionovertime.py index aede25bbd..984bce37a 100644 --- a/functions/get_countriesproductionovertime.py +++ b/functions/get_countriesproductionovertime.py @@ -13,7 +13,7 @@ def get_countries_production_over_time(df, top_k_countries): A Plotly figure object representing the country's production over time. 
""" df = metaTagExtraction(df, "AU_CO") - data = df.get() + data = df.copy() AFF = pd.Series(data["AU_CO"]).dropna().apply(lambda x: [aff.strip() for aff in x if aff.strip() != ""]) nAFF = [len(aff) for aff in AFF] diff --git a/functions/get_data.py b/functions/get_data.py index 16baed992..317187d21 100644 --- a/functions/get_data.py +++ b/functions/get_data.py @@ -37,7 +37,7 @@ def get_data(input, database, df, reset_callback=None): text = ui.p( f"{database}'s files uploaded and processed successfully! " f"{len(file)} files have been processed and combined. " - f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." + f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." ) else: # Process single file (original logic) @@ -52,12 +52,12 @@ def get_data(input, database, df, reset_callback=None): text = ui.p( f"{database}'s ZIP archive uploaded and extracted successfully! " f"Multiple files have been processed and combined. " - f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." + f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." ) else: text = ui.p( f"{database}'s file uploaded successfully! You can now proceed to analyze your data. " - f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." + f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." ) except Exception as e: text = ui.div( @@ -73,7 +73,7 @@ def get_data(input, database, df, reset_callback=None): reset_callback() text = ui.p( f"{database}'s file uploaded successfully! You can now proceed to analyze your data. " - f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." + f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." 
) else: diff --git a/functions/get_factorialanalysis.py b/functions/get_factorialanalysis.py index 3324bcfb6..114da2693 100644 --- a/functions/get_factorialanalysis.py +++ b/functions/get_factorialanalysis.py @@ -74,7 +74,7 @@ def get_factorial_analysis( # Set ngrams based on word_type ngrams = int(ngram) if field in ['TI', 'AB'] else 1 - M = df.get() + M = df.copy() tab = table_tag(M, field, ngrams) if len(tab) >= 2: diff --git a/functions/get_filters.py b/functions/get_filters.py index 206c215aa..989cfa634 100644 --- a/functions/get_filters.py +++ b/functions/get_filters.py @@ -12,7 +12,7 @@ def get_filters(df): Returns: A DataFrame with additional columns for filters and metrics. """ - data = df.get() + data = df.copy() # Calculate the minimum and maximum publication years data["Min_Year"] = data["PY"].min() diff --git a/functions/get_frequentwords.py b/functions/get_frequentwords.py index 8d790ffe1..d576b8def 100644 --- a/functions/get_frequentwords.py +++ b/functions/get_frequentwords.py @@ -100,7 +100,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): """ Extract and count words from a specified field in the DataFrame. 
""" - M = df.get() + M = df.copy() # Remove duplicates M = M.drop_duplicates(subset='SR') diff --git a/functions/get_localcitedauthors.py b/functions/get_localcitedauthors.py index e663192bc..60dcdee8c 100644 --- a/functions/get_localcitedauthors.py +++ b/functions/get_localcitedauthors.py @@ -20,7 +20,7 @@ def get_local_cited_authors(df, num_of_cited_authors, fast_search=False): loccit = 1 df = metaTagExtraction(df, "SR") - M = df.get() + M = df.copy() # Fill missing values M['TC'] = M['TC'].fillna(0) diff --git a/functions/get_localciteddocuments.py b/functions/get_localciteddocuments.py index 1dea8d5a5..9e63f855a 100644 --- a/functions/get_localciteddocuments.py +++ b/functions/get_localciteddocuments.py @@ -14,7 +14,7 @@ def get_local_cited_documents(df, num_of_local_cited_docs, field_separator, fast A Plotly figure object and a DataFrame of the most local cited documents. """ df = metaTagExtraction(df, "SR") - M = df.get() + M = df.copy() # Determine the local citation threshold if fast_search: diff --git a/functions/get_localcitedreferences.py b/functions/get_localcitedreferences.py index 68ea11fef..80e8580c4 100644 --- a/functions/get_localcitedreferences.py +++ b/functions/get_localcitedreferences.py @@ -13,7 +13,7 @@ def get_local_cited_refs(df, num_of_cited_refs, field_separator): Returns: A Plotly figure object and a DataFrame of the most local cited sources. 
""" - data = df.get() + data = df.copy() if isinstance(data["CR"].iloc[0], list): # Check if the first element is a list # Flatten the 'CR' column containing lists diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py index 74b261455..be4965804 100644 --- a/functions/get_localcitedsources.py +++ b/functions/get_localcitedsources.py @@ -16,7 +16,7 @@ def get_local_cited_sources(df, num_of_cited_sources): # Extract metadata tags for cited sources df = metaTagExtraction(df, "CR_SO") - data = df.get() + data = df.copy() if isinstance(data["CR_SO"].iloc[0], list): # Check if the first element is a list # Flatten the 'CR_SO' column containing lists diff --git a/functions/get_lotkalaw.py b/functions/get_lotkalaw.py index 94545fda2..a6051f504 100644 --- a/functions/get_lotkalaw.py +++ b/functions/get_lotkalaw.py @@ -14,7 +14,7 @@ def get_lotka_law(df): """ # Calculate Lotka's Law - data = df.get() + data = df.copy() # Author Productivity (Lotka's Law) authors = pd.Series([author.strip() for sublist in data['AU'] for author in sublist]) diff --git a/functions/get_maininformations.py b/functions/get_maininformations.py index 97443abdb..64834435b 100644 --- a/functions/get_maininformations.py +++ b/functions/get_maininformations.py @@ -1,4 +1,7 @@ from www.services import * +import time +import pandas as pd +from www.services.metatagextraction import metaTagExtraction def get_main_informations(df, log=False): @@ -12,7 +15,7 @@ def get_main_informations(df, log=False): Returns: A DataFrame with additional columns for filters and metrics. 
""" - data = df.get() + data = df.copy() #### Min and Max Year #### start_time = time.time() @@ -99,7 +102,7 @@ def count_authors(entry): if "AU_CO" not in data.columns: # Extract the required metadata df = metaTagExtraction(df, "AU_CO") - data = df.get() + data = df.copy() # Calculate "Country_Count" with a vectorized function data["Country_Count"] = data["AU_CO"].apply(lambda x: len(set(x))) diff --git a/functions/get_referencesspectroscopy.py b/functions/get_referencesspectroscopy.py index a2c3e1522..d923b7c30 100644 --- a/functions/get_referencesspectroscopy.py +++ b/functions/get_referencesspectroscopy.py @@ -16,7 +16,7 @@ def get_references_spectroscopy(df, start_year, end_year=2005, field_separator_s rpys_table (pd.DataFrame): Table with RPYS data (years, citations, deviation from median, top references). cr_table (pd.DataFrame): Table of cited references with local citation counts and Google Scholar links. """ - df = df.get() + df = df.copy() # Pulizia e preparazione dei dati c_references = df['CR'].apply(lambda x: [i for i in x]).explode() diff --git a/functions/get_relevantaffiliations.py b/functions/get_relevantaffiliations.py index b86e36509..981366a9b 100644 --- a/functions/get_relevantaffiliations.py +++ b/functions/get_relevantaffiliations.py @@ -13,7 +13,7 @@ def get_relevant_affiliations(df, num_of_affiliations, disambiguation): Returns: A Plotly figure object and a DataFrame of the most relevant authors. """ - data = df.get() + data = df.copy() if disambiguation == "yes": # Extract affiliations from the "AU_UN" field diff --git a/functions/get_relevantauthors.py b/functions/get_relevantauthors.py index cdf960151..dc0e8f922 100644 --- a/functions/get_relevantauthors.py +++ b/functions/get_relevantauthors.py @@ -13,7 +13,7 @@ def get_relevant_authors(df, num_of_authors, frequency="N. of Documents"): Returns: A Plotly figure object and a DataFrame of the most relevant authors. 
""" - data = df.get() + data = df.copy() # Drop rows with missing values data = data.dropna(subset=["AU"]) diff --git a/functions/get_relevantsources.py b/functions/get_relevantsources.py index dccd8d3e5..377b0db5f 100644 --- a/functions/get_relevantsources.py +++ b/functions/get_relevantsources.py @@ -12,7 +12,7 @@ def get_relevant_sources(df, num_of_sources): Returns: A Plotly figure object and a DataFrame of the most relevant sources. """ - data = df.get() + data = df.copy() # Drop rows with missing values data = data.dropna(subset=["SO"]) diff --git a/functions/get_sourceslocalimpact.py b/functions/get_sourceslocalimpact.py index 731c97194..ac14f9029 100644 --- a/functions/get_sourceslocalimpact.py +++ b/functions/get_sourceslocalimpact.py @@ -13,7 +13,7 @@ def get_sources_local_impact(df, num_of_sources_local_impact, source_local_impac Returns: A Plotly figure object and a DataFrame of the most impactful sources. """ - df = df.get() + df = df.copy() today = pd.Timestamp.now().year # Ensure 'TC' and 'PY' are numeric diff --git a/functions/get_sourcesproduction.py b/functions/get_sourcesproduction.py index 0795668d7..9bcda1fd8 100644 --- a/functions/get_sourcesproduction.py +++ b/functions/get_sourcesproduction.py @@ -13,7 +13,7 @@ def get_sources_production(df, num_of_sources_production, occurences): Returns: A Plotly figure object representing the sources' production over time. """ - data = df.get() + data = df.copy() # Calculate the number of publications per year for each source WSO = cocMatrix(df, Field="SO") diff --git a/functions/get_table.py b/functions/get_table.py index 75b9c91d8..270e7bba3 100644 --- a/functions/get_table.py +++ b/functions/get_table.py @@ -79,7 +79,7 @@ def get_table(database, df, dpi=300, filter=False, modal=True): A DataTable object if data is available, otherwise a message indicating no data. 
""" # Retrieve the data from the DataFrame - data = df.get() + data = df.copy() table_html = "" fig = None @@ -205,7 +205,7 @@ def get_table(database, df, dpi=300, filter=False, modal=True): # Return a DataTable object with the data and the HTML/Plotly tables return ui.HTML( DT( - df.get(), + df.copy(), maxBytes="10MB", classes="display compact stripe", style="text-transform: uppercase; font-size: small; table-layout: auto;", diff --git a/functions/get_treemap.py b/functions/get_treemap.py index 1f3f765f0..8d2b35163 100644 --- a/functions/get_treemap.py +++ b/functions/get_treemap.py @@ -75,7 +75,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): """ Extract and count words from a specified field in the DataFrame. """ - M = df.get() + M = df.copy() # Remove duplicates M = M.drop_duplicates(subset='SR') diff --git a/functions/get_trendtopics.py b/functions/get_trendtopics.py index 1d2f1df3a..145545613 100644 --- a/functions/get_trendtopics.py +++ b/functions/get_trendtopics.py @@ -99,7 +99,7 @@ def field_by_year(df, field, timespan, min_freq, n_items, remove_terms=None, syn # Create co-occurrence matrix A = cocMatrix(df, Field=field, binary=False, remove_terms=remove_terms, synonyms=synonyms) n = A.sum(axis=0).to_numpy() # Convert to 1D array - df = df.get() + df = df.copy() # Calculate quantiles trend_med = pd.DataFrame(A.values).apply(lambda x: pd.Series(np.round(np.quantile(np.repeat(df['PY'], x), [0.25, 0.5, 0.75]))), axis=0).T diff --git a/functions/get_wordcloud.py b/functions/get_wordcloud.py index e902f3bd6..d30a3f8db 100644 --- a/functions/get_wordcloud.py +++ b/functions/get_wordcloud.py @@ -106,7 +106,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): """ Extract and count words from a specified field in the DataFrame. 
""" - M = df.get() + M = df.copy() # Remove duplicates M = M.drop_duplicates(subset='SR') diff --git a/functions/get_worldmapcollaboration.py b/functions/get_worldmapcollaboration.py index 9edafa879..c10fd8a54 100644 --- a/functions/get_worldmapcollaboration.py +++ b/functions/get_worldmapcollaboration.py @@ -10,7 +10,7 @@ def get_world_map_collaboration(df, edges_min=1, edgesize=5): # Estrai metadati dai paesi (assumi che tu abbia già AU_CO processato) M = df df = metaTagExtraction(df, "AU_CO") - df = df.get() + df = df.copy() # Normalizza e conta le occorrenze dei paesi (come in get_countries_production) df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x]) diff --git a/www/services/cocmatrix.py b/www/services/cocmatrix.py index f523aed67..047995fab 100644 --- a/www/services/cocmatrix.py +++ b/www/services/cocmatrix.py @@ -19,7 +19,7 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short Returns: A bipartite network matrix with cases corresponding to manuscripts and variables to the objects extracted from the Tag Field. 
""" - M = df.get() + M = df.copy() if "LABEL" not in M.columns: M.index = M["SR"] diff --git a/www/services/couplingmap.py b/www/services/couplingmap.py index a2b3628d7..e06467898 100644 --- a/www/services/couplingmap.py +++ b/www/services/couplingmap.py @@ -16,7 +16,7 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, return None df = metaTagExtraction(df, "SR") # serve questo per avere il merging perfetto per uniformare la colonna SR - M = df.get() + M = df.copy() ngrams = int(ngrams) minfreq = max(0, int(minfreq * len(M) // 1000)) @@ -436,7 +436,7 @@ def labeling(df, df_lab, term, n, n_labels, analysis, ngrams): # Se il termine è TI o AB, estrai termini if term in ["TI", "AB"]: df = term_extraction(reactive.Value(df), field=term, ngrams=ngrams, verbose=False) - df = df.get() + df = df.copy() term = f"{term}_TM" # Normalizzazione delle stringhe per evitare errori di merge @@ -517,7 +517,7 @@ def best_lab(df, tab_global, n_labels, term): def localCitations(df, fast_search=False, sep=";"): df = metaTagExtraction(df, "SR") - M = df.get() + M = df.copy() M['TC'] = M['TC'].fillna(0) if fast_search: loccit = M['TC'].quantile(0.75) diff --git a/www/services/etl/mappings/__init__.py b/www/services/etl/mappings/__init__.py index b5aa1178f..ca3638c31 100644 --- a/www/services/etl/mappings/__init__.py +++ b/www/services/etl/mappings/__init__.py @@ -1,5 +1,5 @@ -from .scopus_mappin import SCOPUS_CSV_MAPPING +from .scopus_mapping import SCOPUS_CSV_MAPPING from .dimensions_mapping import DIMENSIONS_MAPPING from .pubmed_mapping import PUBMED_MAPPING -from .openlex_mapping import OPENALEX_MAPPING +from .openalex_mapping import OPENALEX_MAPPING diff --git a/www/services/histnetwork.py b/www/services/histnetwork.py index 7848d9744..dee14c14b 100644 --- a/www/services/histnetwork.py +++ b/www/services/histnetwork.py @@ -19,7 +19,7 @@ def histNetwork(df, min_citations=0, sep=";", network=True): - M: A DataFrame containing the metadata of the papers with the 
Local Citation Score (LCS). - LCS: A list containing the Local Citation Score of each paper. """ - M = df.get() + M = df.copy() db = M['DB'][0] # Ensure required fields are present diff --git a/www/services/metatagextraction.py b/www/services/metatagextraction.py index 5e1f8b9c8..94efbca82 100644 --- a/www/services/metatagextraction.py +++ b/www/services/metatagextraction.py @@ -14,7 +14,7 @@ def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False): Returns: A DataFrame with the extracted metadata tags. """ - M = df.get() + M = df.copy() if Field == "SR": M = SR(M) @@ -40,10 +40,8 @@ def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False): ind = M["AU1_UN"].str.find("),") a = ind[ind > -1].index M.loc[a, "AU1_UN"] = M.loc[a, "AU1_UN"].str[ind[a] + 2:] - - df.set(M) - return df + return M def SR(M): diff --git a/www/services/termextraction.py b/www/services/termextraction.py index f7d9a52c1..95bd7040b 100644 --- a/www/services/termextraction.py +++ b/www/services/termextraction.py @@ -20,7 +20,7 @@ def term_extraction(df, field="TI", ngrams=1, stemming=False, language="english" Returns: A DataFrame with the extracted terms. 
""" - M = df.get() + M = df.copy() # Load and update stopwords overall_start_time = time.time() diff --git a/www/services/thematicmap.py b/www/services/thematicmap.py index 3c313b7f6..413e1e3c2 100644 --- a/www/services/thematicmap.py +++ b/www/services/thematicmap.py @@ -7,7 +7,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False): # df = metaTagExtraction(df, field=field) M = df - m = df.get() + m = df.copy() # Set ngrams based on field ngrams = int(ngrams) if field in ['TI', 'AB'] else 1 From a257aefb81415ea53c93d9593a6440e2a9181bd9 Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Mon, 8 Jun 2026 17:44:07 +0200 Subject: [PATCH 4/9] Fix analytical functions: add missing imports (numpy, plotly, collections, geopandas, matatagextraction) --- functions/get_affiliationproductionovertime.py | 3 +++ functions/get_annualproduction.py | 1 + functions/get_authorlocalimpact.py | 3 +++ functions/get_authorproductionovertime.py | 3 +++ functions/get_averagecitations.py | 4 +++- functions/get_bradfordlaw.py | 3 +++ functions/get_citedcountries.py | 3 +++ functions/get_citeddocuments.py | 3 +++ functions/get_clusteringcoupling.py | 3 +++ functions/get_co_occurence_network.py | 3 +++ functions/get_cocitation.py | 3 +++ functions/get_collaborationnetwork.py | 4 ++++ functions/get_correspondingauthorcountries.py | 3 +++ functions/get_countriesproduction.py | 6 +++++- functions/get_countriesproductionovertime.py | 3 +++ functions/get_data.py | 3 +++ functions/get_database.py | 3 +++ functions/get_factorialanalysis.py | 3 +++ functions/get_filters.py | 3 +++ functions/get_frequentwords.py | 4 ++++ functions/get_historiograph.py | 3 +++ functions/get_localcitedauthors.py | 3 +++ functions/get_localciteddocuments.py | 3 +++ functions/get_localcitedreferences.py | 3 +++ functions/get_localcitedsources.py | 3 +++ 
functions/get_lotkalaw.py | 5 ++++- functions/get_maininformations.py | 2 ++ functions/get_referencesspectroscopy.py | 3 +++ functions/get_relevantaffiliations.py | 3 +++ functions/get_relevantauthors.py | 3 +++ functions/get_relevantsources.py | 4 ++++ functions/get_sourceslocalimpact.py | 3 +++ functions/get_sourcesproduction.py | 3 +++ functions/get_status.py | 3 +++ functions/get_table.py | 3 +++ functions/get_thematicevolution.py | 3 +++ functions/get_thematicmap.py | 3 +++ functions/get_threefieldplot.py | 3 +++ functions/get_treemap.py | 3 +++ functions/get_trendtopics.py | 3 +++ functions/get_wordcloud.py | 3 +++ functions/get_wordfrequency.py | 3 +++ functions/get_worldmapcollaboration.py | 4 ++-- 43 files changed, 131 insertions(+), 5 deletions(-) diff --git a/functions/get_affiliationproductionovertime.py b/functions/get_affiliationproductionovertime.py index 3b8e041fd..b9a281d8f 100644 --- a/functions/get_affiliationproductionovertime.py +++ b/functions/get_affiliationproductionovertime.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_affiliation_production_over_time(df, top_k_affiliations): diff --git a/functions/get_annualproduction.py b/functions/get_annualproduction.py index 4c3f4b14a..3eabe932e 100644 --- a/functions/get_annualproduction.py +++ b/functions/get_annualproduction.py @@ -4,6 +4,7 @@ import plotly.graph_objects as go + def get_annual_production(df): """ Generate a plot of annual scientific production. 
diff --git a/functions/get_authorlocalimpact.py b/functions/get_authorlocalimpact.py index a1ca0946e..49341daee 100644 --- a/functions/get_authorlocalimpact.py +++ b/functions/get_authorlocalimpact.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impact): diff --git a/functions/get_authorproductionovertime.py b/functions/get_authorproductionovertime.py index e21a3dcad..d0d7d177c 100644 --- a/functions/get_authorproductionovertime.py +++ b/functions/get_authorproductionovertime.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_author_production_over_time(df, top_k_authors): diff --git a/functions/get_averagecitations.py b/functions/get_averagecitations.py index fbff6a8a0..fc5905d80 100644 --- a/functions/get_averagecitations.py +++ b/functions/get_averagecitations.py @@ -1,5 +1,7 @@ from www.services import * - +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_average_citations(df): """ diff --git a/functions/get_bradfordlaw.py b/functions/get_bradfordlaw.py index d71532fbd..c9b625a10 100644 --- a/functions/get_bradfordlaw.py +++ b/functions/get_bradfordlaw.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_bradford_law(df): diff --git a/functions/get_citedcountries.py b/functions/get_citedcountries.py index 5021cf3bd..13fffff31 100644 --- a/functions/get_citedcountries.py +++ b/functions/get_citedcountries.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_cited_countries(df, num_of_cited_countries, cited_countries_measure): diff --git a/functions/get_citeddocuments.py b/functions/get_citeddocuments.py index 
024db6b0d..56ef36cd1 100644 --- a/functions/get_citeddocuments.py +++ b/functions/get_citeddocuments.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_cited_documents(df, num_of_cited_docs, cited_docs_measure): diff --git a/functions/get_clusteringcoupling.py b/functions/get_clusteringcoupling.py index 8263a46b3..6ed04b8ae 100644 --- a/functions/get_clusteringcoupling.py +++ b/functions/get_clusteringcoupling.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_clustering_coupling(df, unit_of_analysis, coupling_measured, stemmer, impact_measure, diff --git a/functions/get_co_occurence_network.py b/functions/get_co_occurence_network.py index c665d0ccd..09630fe54 100644 --- a/functions/get_co_occurence_network.py +++ b/functions/get_co_occurence_network.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes, diff --git a/functions/get_cocitation.py b/functions/get_cocitation.py index 8bad105c0..b4ea191b6 100644 --- a/functions/get_cocitation.py +++ b/functions/get_cocitation.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_co_citation( diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py index d0f477879..a88b14d11 100644 --- a/functions/get_collaborationnetwork.py +++ b/functions/get_collaborationnetwork.py @@ -1,5 +1,9 @@ from www.services import * import json +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go + def get_collaboration_network( df, field, network_layout, clustering_algorithm, repulsion, shape, opacity, shadow, 
curved, colnormalize, labelsize, edgesize, label_cex, nodes, isolates, edges_min diff --git a/functions/get_correspondingauthorcountries.py b/functions/get_correspondingauthorcountries.py index 84a3cbad8..774a93a77 100644 --- a/functions/get_correspondingauthorcountries.py +++ b/functions/get_correspondingauthorcountries.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_corresponding_author_countries(df, top_k_countries): diff --git a/functions/get_countriesproduction.py b/functions/get_countriesproduction.py index 7de516ca9..a9550a9e8 100644 --- a/functions/get_countriesproduction.py +++ b/functions/get_countriesproduction.py @@ -1,5 +1,9 @@ from www.services import * - +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from www.services.metatagextraction import metaTagExtraction +import geopandas as gpd def get_countries_production(df): """ diff --git a/functions/get_countriesproductionovertime.py b/functions/get_countriesproductionovertime.py index 984bce37a..662733890 100644 --- a/functions/get_countriesproductionovertime.py +++ b/functions/get_countriesproductionovertime.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_countries_production_over_time(df, top_k_countries): diff --git a/functions/get_data.py b/functions/get_data.py index 317187d21..a50e9aa2b 100644 --- a/functions/get_data.py +++ b/functions/get_data.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_data(input, database, df, reset_callback=None): diff --git a/functions/get_database.py b/functions/get_database.py index 5c5d4edc5..37a847fc0 100644 --- a/functions/get_database.py +++ b/functions/get_database.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px 
+import plotly.graph_objects as go def get_database(input): diff --git a/functions/get_factorialanalysis.py b/functions/get_factorialanalysis.py index 114da2693..1176e5c68 100644 --- a/functions/get_factorialanalysis.py +++ b/functions/get_factorialanalysis.py @@ -1,5 +1,8 @@ from www.services import * from scipy.spatial import ConvexHull, QhullError +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def distance_to_y(dist, max_dist, scale_factor): norm = math.log1p(dist) / math.log1p(max_dist) diff --git a/functions/get_filters.py b/functions/get_filters.py index 989cfa634..43ee1deeb 100644 --- a/functions/get_filters.py +++ b/functions/get_filters.py @@ -1,5 +1,8 @@ from www.services import * from functions.get_table import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_filters(df): diff --git a/functions/get_frequentwords.py b/functions/get_frequentwords.py index d576b8def..08a7d4d82 100644 --- a/functions/get_frequentwords.py +++ b/functions/get_frequentwords.py @@ -1,4 +1,8 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +from collections import Counter def get_frequent_words(df, ngram, num_of_words, word_type, file_upload_terms, file_upload_synonyms, field_separator_frequent=';'): diff --git a/functions/get_historiograph.py b/functions/get_historiograph.py index 089d02387..511ab2fc9 100644 --- a/functions/get_historiograph.py +++ b/functions/get_historiograph.py @@ -2,10 +2,13 @@ from pyvis.network import Network import tempfile import pandas as pd +import plotly.express as px +import plotly.graph_objects as go import networkx as nx import os from matplotlib.colors import to_rgba + def hex_to_rgba(hex_color, alpha): if not isinstance(hex_color, str) or not hex_color.startswith("#") or len(hex_color) != 7: hex_color = "#999999" # fallback grigio neutro diff --git a/functions/get_localcitedauthors.py 
b/functions/get_localcitedauthors.py index 60dcdee8c..2dc6aff78 100644 --- a/functions/get_localcitedauthors.py +++ b/functions/get_localcitedauthors.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_local_cited_authors(df, num_of_cited_authors, fast_search=False): diff --git a/functions/get_localciteddocuments.py b/functions/get_localciteddocuments.py index 9e63f855a..0a8d7c4d4 100644 --- a/functions/get_localciteddocuments.py +++ b/functions/get_localciteddocuments.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_local_cited_documents(df, num_of_local_cited_docs, field_separator, fast_search=False): diff --git a/functions/get_localcitedreferences.py b/functions/get_localcitedreferences.py index 80e8580c4..03efd7b77 100644 --- a/functions/get_localcitedreferences.py +++ b/functions/get_localcitedreferences.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_local_cited_refs(df, num_of_cited_refs, field_separator): diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py index be4965804..74fe04864 100644 --- a/functions/get_localcitedsources.py +++ b/functions/get_localcitedsources.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_local_cited_sources(df, num_of_cited_sources): diff --git a/functions/get_lotkalaw.py b/functions/get_lotkalaw.py index a6051f504..4307f0903 100644 --- a/functions/get_lotkalaw.py +++ b/functions/get_lotkalaw.py @@ -1,5 +1,8 @@ from www.services import * - +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import numpy as np def get_lotka_law(df): """ diff --git a/functions/get_maininformations.py 
b/functions/get_maininformations.py index 64834435b..befc35f20 100644 --- a/functions/get_maininformations.py +++ b/functions/get_maininformations.py @@ -1,6 +1,8 @@ from www.services import * import time import pandas as pd +import plotly.express as px +import plotly.graph_objects as go from www.services.metatagextraction import metaTagExtraction diff --git a/functions/get_referencesspectroscopy.py b/functions/get_referencesspectroscopy.py index d923b7c30..e77bb129d 100644 --- a/functions/get_referencesspectroscopy.py +++ b/functions/get_referencesspectroscopy.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_references_spectroscopy(df, start_year, end_year=2005, field_separator_spec=';'): diff --git a/functions/get_relevantaffiliations.py b/functions/get_relevantaffiliations.py index 981366a9b..67baef2be 100644 --- a/functions/get_relevantaffiliations.py +++ b/functions/get_relevantaffiliations.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_relevant_affiliations(df, num_of_affiliations, disambiguation): diff --git a/functions/get_relevantauthors.py b/functions/get_relevantauthors.py index dc0e8f922..c154b9c2f 100644 --- a/functions/get_relevantauthors.py +++ b/functions/get_relevantauthors.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_relevant_authors(df, num_of_authors, frequency="N. 
of Documents"): diff --git a/functions/get_relevantsources.py b/functions/get_relevantsources.py index 377b0db5f..2498bedba 100644 --- a/functions/get_relevantsources.py +++ b/functions/get_relevantsources.py @@ -1,4 +1,8 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go + def get_relevant_sources(df, num_of_sources): diff --git a/functions/get_sourceslocalimpact.py b/functions/get_sourceslocalimpact.py index ac14f9029..9af522c33 100644 --- a/functions/get_sourceslocalimpact.py +++ b/functions/get_sourceslocalimpact.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_sources_local_impact(df, num_of_sources_local_impact, source_local_impact): diff --git a/functions/get_sourcesproduction.py b/functions/get_sourcesproduction.py index 9bcda1fd8..bc2ba1ee3 100644 --- a/functions/get_sourcesproduction.py +++ b/functions/get_sourcesproduction.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_sources_production(df, num_of_sources_production, occurences): diff --git a/functions/get_status.py b/functions/get_status.py index b5c412e67..16c449ca7 100644 --- a/functions/get_status.py +++ b/functions/get_status.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_status(missing_percentage): diff --git a/functions/get_table.py b/functions/get_table.py index 270e7bba3..9e32be5bf 100644 --- a/functions/get_table.py +++ b/functions/get_table.py @@ -1,5 +1,8 @@ from www.services import * from functions.get_status import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go # Function to create a Plotly table visualization for metadata completeness diff --git a/functions/get_thematicevolution.py b/functions/get_thematicevolution.py 
index 65bb0077b..c1288b8bf 100644 --- a/functions/get_thematicevolution.py +++ b/functions/get_thematicevolution.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_thematic_evolution(df, field="ID", years=None, n=250, weight_index="inc_index", min_weight_index=0.1, minFreq=2, diff --git a/functions/get_thematicmap.py b/functions/get_thematicmap.py index 68d1f37d6..d0d4bfd71 100644 --- a/functions/get_thematicmap.py +++ b/functions/get_thematicmap.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False): diff --git a/functions/get_threefieldplot.py b/functions/get_threefieldplot.py index b7a4a1514..7495cf1dd 100644 --- a/functions/get_threefieldplot.py +++ b/functions/get_threefieldplot.py @@ -1,5 +1,8 @@ from www.services import * import textwrap +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_three_field_plot(df, left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items): diff --git a/functions/get_treemap.py b/functions/get_treemap.py index 8d2b35163..3725292fd 100644 --- a/functions/get_treemap.py +++ b/functions/get_treemap.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_treemap(df, ngram, num_of_words, word_type, file_upload_terms, file_upload_synonyms, field_separator_frequent=';'): diff --git a/functions/get_trendtopics.py b/functions/get_trendtopics.py index 145545613..8d853553f 100644 --- a/functions/get_trendtopics.py +++ b/functions/get_trendtopics.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd 
+import plotly.express as px +import plotly.graph_objects as go def get_trend_topics(df, ngram, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_minimum_frequency, number_of_words_year): diff --git a/functions/get_wordcloud.py b/functions/get_wordcloud.py index d30a3f8db..5aa3b0c6a 100644 --- a/functions/get_wordcloud.py +++ b/functions/get_wordcloud.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def is_legible_on_white(color): diff --git a/functions/get_wordfrequency.py b/functions/get_wordfrequency.py index 1f2b81a06..ca1dd76d3 100644 --- a/functions/get_wordfrequency.py +++ b/functions/get_wordfrequency.py @@ -1,4 +1,7 @@ from www.services import * +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go def get_word_frequency(df, ngram, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words): diff --git a/functions/get_worldmapcollaboration.py b/functions/get_worldmapcollaboration.py index c10fd8a54..c6319911b 100644 --- a/functions/get_worldmapcollaboration.py +++ b/functions/get_worldmapcollaboration.py @@ -1,10 +1,10 @@ from www.services import * import pandas as pd -import geopandas as gpd -import networkx as nx import plotly.express as px import plotly.graph_objects as go +import geopandas as gpd +import networkx as nx def get_world_map_collaboration(df, edges_min=1, edgesize=5): # Estrai metadati dai paesi (assumi che tu abbia già AU_CO processato) From c7d4533019b9fcd7ec7d687de8498113db67a8ea Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Mon, 8 Jun 2026 18:11:00 +0200 Subject: [PATCH 5/9] Fix cocmatrix.py: cast field values to str before split; add missing imports in bradfordlaw and sourcesproduction --- functions/get_bradfordlaw.py | 2 +- functions/get_sourcesproduction.py | 1 + www/services/cocmatrix.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git 
a/functions/get_bradfordlaw.py b/functions/get_bradfordlaw.py index c9b625a10..04303b4de 100644 --- a/functions/get_bradfordlaw.py +++ b/functions/get_bradfordlaw.py @@ -2,7 +2,7 @@ import pandas as pd import plotly.express as px import plotly.graph_objects as go - +import numpy as np def get_bradford_law(df): """ diff --git a/functions/get_sourcesproduction.py b/functions/get_sourcesproduction.py index bc2ba1ee3..5b933575b 100644 --- a/functions/get_sourcesproduction.py +++ b/functions/get_sourcesproduction.py @@ -2,6 +2,7 @@ import pandas as pd import plotly.express as px import plotly.graph_objects as go +from www. services.cocmatrix import cocMatrix def get_sources_production(df, num_of_sources_production, occurences): diff --git a/www/services/cocmatrix.py b/www/services/cocmatrix.py index 047995fab..0da83cb2f 100644 --- a/www/services/cocmatrix.py +++ b/www/services/cocmatrix.py @@ -28,7 +28,7 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short # REMOVE TERMS AND MERGE SYNONYMS if Field in ["ID", "DE", "TI", "TI_TM", "AB", "AB_TM"]: - Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)]) + Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in str(x).split(sep)]) TERMS = pd.DataFrame({"item": [item.upper() for sublist in Fi for item in sublist], "SR": M.index.repeat(Fi.str.len())}) # Merge synonyms @@ -49,7 +49,7 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short M["CR"] = M["CR"].apply(lambda x: [ref.replace("DOI;", "DOI ") for ref in x] if isinstance(x, list) else x) if Field in M.columns: - Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)]) + Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in str(x).split(sep)]) else: print(f"Field {Field} is not a column name of input data frame") return From 
e77e018c82db7ced985b453329e8607b13177e52 Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Tue, 9 Jun 2026 13:49:40 +0200 Subject: [PATCH 6/9] Fix dashboard functions: add missing imports (numpy, histNetwork, biblionetwork, cocMatrix, term_extraction), fix megaTagExtraction overwrite bug, fix NaN in plotly marker size --- app.py | 199 +++++++++--------- functions/__init__.py | 45 +++- .../get_affiliationproductionovertime.py | 7 + functions/get_annualproduction.py | 7 + functions/get_authorlocalimpact.py | 8 + functions/get_authorproductionovertime.py | 7 + functions/get_averagecitations.py | 7 + functions/get_bradfordlaw.py | 7 + functions/get_citedcountries.py | 7 + functions/get_citeddocuments.py | 7 + functions/get_clusteringcoupling.py | 7 + functions/get_co_occurence_network.py | 7 + functions/get_cocitation.py | 7 + functions/get_collaborationnetwork.py | 7 + functions/get_correspondingauthorcountries.py | 7 + functions/get_countriesproduction.py | 12 ++ functions/get_countriesproductionovertime.py | 7 + functions/get_data.py | 19 +- functions/get_database.py | 7 + functions/get_factorialanalysis.py | 8 + functions/get_filters.py | 7 + functions/get_frequentwords.py | 7 + functions/get_historiograph.py | 7 + functions/get_localcitedauthors.py | 13 +- functions/get_localciteddocuments.py | 7 + functions/get_localcitedreferences.py | 7 + functions/get_localcitedsources.py | 7 + functions/get_lotkalaw.py | 7 + functions/get_maininformations.py | 15 +- functions/get_referencesspectroscopy.py | 7 + functions/get_relevantaffiliations.py | 7 + functions/get_relevantauthors.py | 7 + functions/get_relevantsources.py | 7 + functions/get_sourceslocalimpact.py | 7 + functions/get_sourcesproduction.py | 7 + functions/get_status.py | 7 + functions/get_table.py | 19 +- functions/get_thematicevolution.py | 7 + functions/get_thematicmap.py | 7 + functions/get_threefieldplot.py | 7 + functions/get_treemap.py | 7 + functions/get_trendtopics.py | 7 + functions/get_wordcloud.py | 
7 + functions/get_wordfrequency.py | 7 + functions/get_worldmapcollaboration.py | 7 + 45 files changed, 479 insertions(+), 111 deletions(-) diff --git a/app.py b/app.py index f0891f894..daa39d125 100644 --- a/app.py +++ b/app.py @@ -55,14 +55,19 @@ import pandas as pd import io from functions import * -from www.services import * +from www.services.utils import * from google import genai -from shiny import express -from shiny import render, ui from google.genai import types -from shiny import reactive, render -from shinywidgets import render_widget +from shiny import reactive +from shiny import express from shiny.express import ui, input, render +from shinywidgets import render_widget +from functions.get_database import get_database +from functions.get_data import get_data +from functions.get_table import get_table +from functions.get_filters import get_filtered_table +from www.services.savereport import add_to_report +from functions.get_filters import get_filtered_table # Setup the Directory for static assets - optimized for performance base_dir = tempfile.gettempdir() # Use system temp dir instead of creating new temp file @@ -760,7 +765,7 @@ def show_data(): @render.ui @reactive.event(input.start_button) def show_table(): - table_ui, _, _ = get_table(database, df) + table_ui, _, _ = get_table(database, df.get()) return table_ui # -------- ADVICE BUTTON -------- @@ -788,7 +793,7 @@ def close_advice_notification(): @render.ui @reactive.event(input.report_modal_completeness) def show_missing_data_report(): - _, missingData, _ = get_table(database, df, modal=False) + _, missingData, _ = get_table(database, df.get(), modal=False) dataframe = pd.read_html(io.StringIO(missingData)) report_excel.set(add_to_report(report_choices, report_excel, [dataframe[0]], [], "missingdata")) selection.set(selection.get() + (f"{list(report_choices.get().keys())[-1]}",)) @@ -801,7 +806,7 @@ def show_missing_data_report(): @render.ui @reactive.event(input.save_modal_completeness) def 
save_dataframe_image(): - _, _, fig = get_table(database, df, dpi=dpi.get(), modal=False) + _, _, fig = get_table(database, df.get(), dpi=dpi.get(), modal=False) fig.write_image(completeness_table_image_path) return ui.notification_show(f"✅ Missing data image saved into {completeness_table_image_path}", duration=5, close_button=False) @@ -868,7 +873,7 @@ def indicator_types_ui_all(): @reactive.calc def filters(): - return get_filters(df) + return get_filters(df.get()) with ui.layout_sidebar(fillable=False, fill=False): # Sidebar for data import options @@ -1060,7 +1065,7 @@ def loading_modal(): return ui.HTML(str(modal) + js) ui.modal_show(loading_modal()) try: - result = get_main_informations(df) + result = get_main_informations(df.get()) return result finally: ui.modal_remove() @@ -1174,7 +1179,7 @@ def table_informations(): data['Average_Citations_per_Doc'][0] ] }) - return ui.HTML(DT(df_box, style="width=100%;")) + return ui.HTML(DT(df_box, style="width:100%;")) # --- Annual Scientific Production Section --- with ui.nav_panel("None", value="annual_scientific_production"): @@ -1215,7 +1220,7 @@ def show_annual_production_report(): with ui.card(full_screen=True): @reactive.calc def annual_informations(): - return get_annual_production(df) + return get_annual_production(df.get()) with ui.navset_underline(id="annual_tab"): with ui.nav_panel("Plot"): @@ -1228,7 +1233,7 @@ def show_annual_production(): @render.ui def table_annual_production(): _, publications_per_year = annual_informations() - return ui.HTML(DT(publications_per_year, style="width=100%;")) + return ui.HTML(DT(publications_per_year, style="width:100%;")) # AI bot Gemini Chat Integration # --- Floating Chat Button --- @@ -1369,7 +1374,7 @@ def show_average_citations_report(): with ui.card(full_screen=True): @reactive.calc def average_citations(): - return get_average_citations(df) + return get_average_citations(df.get()) with ui.navset_underline(id="average_tab"): with ui.nav_panel("Plot"): @@ -1382,7 
+1387,7 @@ def show_average_citations(): @render.ui def table_average_citations(): _, avg_citations = average_citations() - return ui.HTML(DT(avg_citations, style="width=100%;")) + return ui.HTML(DT(avg_citations, style="width:100%;")) # --- Three-Field Plot Section --- with ui.nav_panel("None", value="three_field_plot"): @@ -1467,7 +1472,7 @@ def calculate_three_field_plot(): middle_field_items = input.middle_field_items() right_field_items = input.right_field_items() - result = get_three_field_plot(df, left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items) + result = get_three_field_plot(df.get(),left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items) three_field_plot_results.set(result) finally: ui.modal_remove() @@ -1601,7 +1606,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: num_of_sources = input.num_of_sources() - result = get_relevant_sources(df, num_of_sources) + result = get_relevant_sources(df.get(),num_of_sources) relevant_sources_results.set(result) finally: ui.modal_remove() @@ -1636,7 +1641,7 @@ def table_relevant_sources(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, relevant_sources_tab = result - return ui.HTML(DT(relevant_sources_tab, style="width=100%;")) + return ui.HTML(DT(relevant_sources_tab, style="width:100%;")) # --- Most Local Cited Sources Section --- with ui.nav_panel("None", value="most_local_cited_sources"): @@ -1745,7 +1750,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: num_of_cited_sources = input.num_of_cited_sources() - result = get_local_cited_sources(df, num_of_cited_sources) + result = get_local_cited_sources(df.get(),num_of_cited_sources) local_cited_sources_results.set(result) finally: ui.modal_remove() @@ -1780,7 +1785,7 @@ def table_local_cited_sources(): style="height: 400px; display: flex; flex-direction: 
column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, local_cited_sources_tab = result - return ui.HTML(DT(local_cited_sources_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_sources_tab, style="width:100%;")) # --- Bradford's Law Section --- with ui.nav_panel("None", value="bradfords_law"): @@ -1821,7 +1826,7 @@ def show_bradfords_law_report(): with ui.card(full_screen=True): @reactive.calc def bradford_law(): - return get_bradford_law(df) + return get_bradford_law(df.get()) with ui.navset_underline(id="bradford_law_tab"): with ui.nav_panel("Plot"): @@ -1834,7 +1839,7 @@ def show_bradford_law(): @render.ui def table_bradford_law(): _, bradford_law_tab = bradford_law() - return ui.HTML(DT(bradford_law_tab, style="width=100%;")) + return ui.HTML(DT(bradford_law_tab, style="width:100%;")) # --- Sources' Local Impact Section --- with ui.nav_panel("None", value="sources_local_impact"): @@ -1945,7 +1950,7 @@ def loading_modal(): try: num_of_sources_local_impact = input.num_of_sources_local_impact() source_local_impact = input.source_local_impact() - result = get_sources_local_impact(df, num_of_sources_local_impact, source_local_impact) + result = get_sources_local_impact(df.get(),num_of_sources_local_impact, source_local_impact) sources_local_impact_results.set(result) finally: ui.modal_remove() @@ -1980,7 +1985,7 @@ def table_sources_local_impact(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, sources_local_impact_tab = result - return ui.HTML(DT(sources_local_impact_tab, style="width=100%;")) + return ui.HTML(DT(sources_local_impact_tab, style="width:100%;")) # --- Sources' Production --- with ui.nav_panel("None", value="sources_production"): @@ -2080,7 +2085,7 @@ def loading_modal(): try: num_of_sources_production = input.num_of_sources_production() occurences = input.occurences() - result = 
get_sources_production(df, num_of_sources_production, occurences) + result = get_sources_production(df.get(),num_of_sources_production, occurences) sources_production_result.set(result) finally: ui.modal_remove() @@ -2126,7 +2131,7 @@ def table_sources_production(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, sources_production_tab = result - return ui.HTML(DT(sources_production_tab, style="width=100%;")) + return ui.HTML(DT(sources_production_tab, style="width:100%;")) # --- Most Relevant Authors Section --- with ui.nav_panel("None", value="most_relevant_authors"): @@ -2227,7 +2232,7 @@ def loading_modal(): try: num_of_authors = input.num_of_authors() frequency = input.frequency() - result = get_relevant_authors(df, num_of_authors, frequency) + result = get_relevant_authors(df.get(), num_of_authors, frequency) relevant_authors_result.set(result) finally: ui.modal_remove() @@ -2273,7 +2278,7 @@ def table_relevant_authors(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, relevant_authors_tab = result - return ui.HTML(DT(relevant_authors_tab, style="width=100%;")) + return ui.HTML(DT(relevant_authors_tab, style="width:100%;")) # --- Most Local Cited Authors Section --- with ui.nav_panel("None", value="most_local_cited_authors"): @@ -2376,7 +2381,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: num_of_cited_authors = input.num_of_cited_authors() - result = get_local_cited_authors(df, num_of_cited_authors) + result = get_local_cited_authors(df.get(),num_of_cited_authors) local_cited_authors_result.set(result) finally: ui.modal_remove() @@ -2421,7 +2426,7 @@ def table_local_cited_authors(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, 
local_cited_authors_tab = result - return ui.HTML(DT(local_cited_authors_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_authors_tab, style="width:100%;")) # --- Authors' Production over Time Section --- with ui.nav_panel("None", value="authors_production"): @@ -2521,7 +2526,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: top_k_authors = input.TopAuthorsProdK() - result = get_author_production_over_time(df, top_k_authors) + result = get_author_production_over_time(df.get(),top_k_authors) au_over_time_result.set(result) finally: ui.modal_remove() @@ -2566,7 +2571,7 @@ def table_authors_production(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, table_authors_production, _ = result - return ui.HTML(DT(table_authors_production, style="width=100%;")) + return ui.HTML(DT(table_authors_production, style="width:100%;")) with ui.nav_panel("Table - Documents"): @render.ui @@ -2584,7 +2589,7 @@ def table_documents(): table_documents['DOI'] = table_documents['DOI'].apply( lambda x: f'{x}' if x != "N/A" else x ) - return ui.HTML(DT(table_documents, style="width=100%;")) + return ui.HTML(DT(table_documents, style="width:100%;")) # AI bot Gemini Chat Integration # --- Floating Chat Button --- @render.express() @@ -2723,7 +2728,7 @@ def show_lotkas_law_report(): with ui.card(full_screen=True): @reactive.calc def lotka_law(): - return get_lotka_law(df) + return get_lotka_law(df.get()) with ui.navset_underline(id="lotka_law_tab"): with ui.nav_panel("Plot"): @@ -2736,7 +2741,7 @@ def show_lotka_law(): @render.ui def table_lotka_law(): _, lotka_law_tab = lotka_law() - return ui.HTML(DT(lotka_law_tab, style="width=100%;")) + return ui.HTML(DT(lotka_law_tab, style="width:100%;")) # --- Authors' Local Impact Section --- with ui.nav_panel("None", value="authors_local_impact"): @@ -2837,7 +2842,7 @@ def loading_modal(): try: num_of_authors_local_impact = 
input.num_of_authors_local_impact() author_local_impact = input.author_local_impact() - result = get_authors_local_impact(df, num_of_authors_local_impact, author_local_impact) + result = get_authors_local_impact(df.get(),num_of_authors_local_impact, author_local_impact) authors_local_impact_result.set(result) finally: ui.modal_remove() @@ -2883,7 +2888,7 @@ def table_authors_local_impact(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, authors_local_impact_tab = result - return ui.HTML(DT(authors_local_impact_tab, style="width=100%;")) + return ui.HTML(DT(authors_local_impact_tab, style="width:100%;")) # --- Most Relevant Affiliations Section --- with ui.nav_panel("None", value="most_relevant_affiliations"): @@ -2984,7 +2989,7 @@ def loading_modal(): try: num_of_affiliations = input.num_of_affiliations() disambiguation = input.disambiguation() - result = get_relevant_affiliations(df, num_of_affiliations, disambiguation) + result = get_relevant_affiliations(df.get(),num_of_affiliations, disambiguation) relevant_affiliations_result.set(result) finally: ui.modal_remove() @@ -3030,7 +3035,7 @@ def table_relevant_affiliations(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, relevant_affiliations_tab = result - return ui.HTML(DT(relevant_affiliations_tab, style="width=100%;")) + return ui.HTML(DT(relevant_affiliations_tab, style="width:100%;")) # --- Affiliations' Production over Time Section --- with ui.nav_panel("None", value="affiliations_production"): @@ -3137,7 +3142,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: top_k_affiliations = input.TopAffProdK() - result = get_affiliation_production_over_time(df, top_k_affiliations) + result = get_affiliation_production_over_time(df.get(),top_k_affiliations) affiliations_production_results.set(result) 
finally: ui.modal_remove() @@ -3172,7 +3177,7 @@ def table_affiliations_production(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, table_affiliations_production = result - return ui.HTML(DT(table_affiliations_production, style="width=100%;")) + return ui.HTML(DT(table_affiliations_production, style="width:100%;")) # --- Affiliations' Local Impact Section --- with ui.nav_panel("None", value="corresponding_authors"): @@ -3281,7 +3286,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: top_k_countries = input.TopCountries() - result = get_corresponding_author_countries(df, top_k_countries) + result = get_corresponding_author_countries(df.get(),top_k_countries) corresponding_authors_results.set(result) finally: ui.modal_remove() @@ -3316,7 +3321,7 @@ def table_countries_collaboration(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, countries_table = result - return ui.HTML(DT(countries_table, style="width=100%;")) + return ui.HTML(DT(countries_table, style="width:100%;")) # --- Countries' Scientific Production Section --- with ui.nav_panel("None", value="countries_scientific_production"): @@ -3406,7 +3411,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: - result = get_countries_production(df) + result = get_countries_production(df.get()) return result finally: ui.modal_remove() @@ -3422,7 +3427,7 @@ def show_countries_production(): @render.ui def table_countries_production(): _, countries_table = countries_production() - return ui.HTML(DT(countries_table, style="width=100%;")) + return ui.HTML(DT(countries_table, style="width:100%;")) # --- Countries' Production over Time Section --- with ui.nav_panel("None", value="countries_production_over_time"): @@ -3531,7 +3536,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: 
top_k_countries = input.TopCountriesProdK() - result = get_countries_production_over_time(df, top_k_countries) + result = get_countries_production_over_time(df.get(),top_k_countries) countries_over_time_results.set(result) finally: ui.modal_remove() @@ -3566,7 +3571,7 @@ def table_countries_over_time(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, countries_table = result - return ui.HTML(DT(countries_table, style="width=100%;")) + return ui.HTML(DT(countries_table, style="width:100%;")) # --- Most Cited Countries Section --- with ui.nav_panel("None", value="most_cited_countries"): @@ -3677,7 +3682,7 @@ def loading_modal(): try: num_of_cited_countries = input.num_of_cited_countries() cited_countries_measure = input.cited_countries() - result = get_cited_countries(df, num_of_cited_countries, cited_countries_measure) + result = get_cited_countries(df.get(),num_of_cited_countries, cited_countries_measure) cited_countries_results.set(result) finally: ui.modal_remove() @@ -3712,7 +3717,7 @@ def table_cited_countries(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, cited_countries_tab = result - return ui.HTML(DT(cited_countries_tab, style="width=100%;")) + return ui.HTML(DT(cited_countries_tab, style="width:100%;")) # --- Most Global Cited Documents Section --- with ui.nav_panel("None", value="most_global_cited_documents"): @@ -3817,7 +3822,7 @@ def loading_modal(): try: num_of_cited_docs = input.num_of_cited_docs() cited_docs = input.cited_docs() - result = get_cited_documents(df, num_of_cited_docs, cited_docs) + result = get_cited_documents(df.get(),num_of_cited_docs, cited_docs) cited_documents_results.set(result) finally: ui.modal_remove() @@ -3852,7 +3857,7 @@ def table_cited_documents(): style="height: 400px; display: flex; flex-direction: column; 
justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, cited_documents_tab = result - return ui.HTML(DT(cited_documents_tab, style="width=100%;")) + return ui.HTML(DT(cited_documents_tab, style="width:100%;")) # --- Most Local Cited Documents Section --- with ui.nav_panel("None", value="most_local_cited_documents"): @@ -3964,7 +3969,7 @@ def loading_modal(): # Run analysis num_of_local_cited_docs = input.num_of_local_cited_docs() field_separator = input.field_separator() - result = get_local_cited_documents(df, num_of_local_cited_docs, field_separator) + result = get_local_cited_documents(df.get(),num_of_local_cited_docs, field_separator) local_cited_documents_results.set(result) finally: ui.modal_remove() @@ -3998,7 +4003,7 @@ def table_local_cited_documents(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, local_cited_documents_tab = result - return ui.HTML(DT(local_cited_documents_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_documents_tab, style="width:100%;")) # --- Most Local Cited References Section --- with ui.nav_panel("None", value="most_local_cited_references"): @@ -4110,7 +4115,7 @@ def loading_modal(): # Run analysis num_of_cited_refs = input.num_of_cited_refs() field_separator_ref = input.field_separator_ref() - result = get_local_cited_refs(df, num_of_cited_refs, field_separator_ref) + result = get_local_cited_refs(df.get(),num_of_cited_refs, field_separator_ref) local_cited_refs_results.set(result) finally: ui.modal_remove() @@ -4144,7 +4149,7 @@ def table_local_cited_refs(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, local_cited_refs_tab = result - return ui.HTML(DT(local_cited_refs_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_refs_tab, style="width:100%;")) # --- References 
Spectroscopy Section --- with ui.nav_panel("None", value="references_spectroscopy"): @@ -4260,7 +4265,7 @@ def loading_modal(): start_year = input.start_year() end_year = input.end_year() field_separator_spec = input.field_separator_spec() - result = get_references_spectroscopy(df, start_year, end_year, field_separator_spec) + result = get_references_spectroscopy(df.get(),start_year, end_year, field_separator_spec) ref_spectroscopy_results.set(result) finally: ui.modal_remove() @@ -4294,7 +4299,7 @@ def table_references_rpy(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, ref_rpy_tab, _ = result - return ui.HTML(DT(ref_rpy_tab, style="width=100%;")) + return ui.HTML(DT(ref_rpy_tab, style="width:100%;")) with ui.nav_panel("Table - Cited References"): @render.ui @@ -4306,7 +4311,7 @@ def table_references_spectroscopy(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, _, ref_spectroscopy_tab = result - return ui.HTML(DT(ref_spectroscopy_tab, style="width=100%;")) + return ui.HTML(DT(ref_spectroscopy_tab, style="width:100%;")) # --- Most Frequent Words --- with ui.nav_panel("None", value="most_frequent_words"): @@ -4470,7 +4475,7 @@ def loading_modal(): file_upload_synonyms_mfw = None synonyms_data_mfw = None - result = get_frequent_words(df, ngram_mfw, num_of_words_mfw, field_mfw, file_upload_terms_mfw, file_upload_synonyms_mfw) + result = get_frequent_words(df.get(),ngram_mfw, num_of_words_mfw, field_mfw, file_upload_terms_mfw, file_upload_synonyms_mfw) frequent_words_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -4524,7 +4529,7 @@ def table_frequent_words(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; 
border-radius: 10px; margin: 20px;" ) _, frequent_words_tab = result - return ui.HTML(DT(frequent_words_tab, style="width=100%;")) + return ui.HTML(DT(frequent_words_tab, style="width:100%;")) # --- WordCloud Section --- with ui.nav_panel("None", value="wordcloud"): @@ -4688,7 +4693,7 @@ def loading_modal(): file_upload_synonyms_wc = None synonyms_data_wc = None - result = get_wordcloud(df, ngram_wc, num_of_words_wc, field_wc, file_upload_terms_wc, file_upload_synonyms_wc) + result = get_wordcloud(df.get(),ngram_wc, num_of_words_wc, field_wc, file_upload_terms_wc, file_upload_synonyms_wc) wordcloud_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -4742,7 +4747,7 @@ def table_wordcloud(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, wordcloud_tab = result - return ui.HTML(DT(wordcloud_tab, style="width=100%;")) + return ui.HTML(DT(wordcloud_tab, style="width:100%;")) # --- TreeMap Section --- with ui.nav_panel("None", value="treemap"): @@ -4906,7 +4911,7 @@ def loading_modal(): file_upload_synonyms_tm = None synonyms_data_tm = None - result = get_treemap(df, ngram_tm, num_of_words_tm, field_tm, file_upload_terms_tm, file_upload_synonyms_tm) + result = get_treemap(df.get(),ngram_tm, num_of_words_tm, field_tm, file_upload_terms_tm, file_upload_synonyms_tm) treemap_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -4960,7 +4965,7 @@ def table_treemap(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, treemap_tab = result - return ui.HTML(DT(treemap_tab, style="width=100%;")) + return ui.HTML(DT(treemap_tab, style="width:100%;")) # --- References Spectroscopy Section --- with ui.nav_panel("None", 
value="words_frequency_over_time"): @@ -5127,7 +5132,7 @@ def loading_modal(): file_upload_synonyms_wf = None synonyms_data_wf = None - result = get_word_frequency(df, ngram_wf, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words) + result = get_word_frequency(df.get(),ngram_wf, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words) word_frequency_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -5357,7 +5362,7 @@ def loading_modal(): word_mimimum_frequency = input.word_mimimum_frequency() number_of_words_year = input.number_of_words_year() - result = get_trend_topics(df, ngram_tt, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_mimimum_frequency, number_of_words_year) + result = get_trend_topics(df.get(),ngram_tt, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_mimimum_frequency, number_of_words_year) trend_topics_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -5561,7 +5566,7 @@ def loading_modal(): community_repulsion = input.community_repulsion() clustering_algorithm = input.clustering_algorithm() - result = get_clustering_coupling(df, unit_of_analysis, coupling_field, stemmer, impact_measure, cluster_labeling, ngram, num_of_units, min_cluster_freq, label_per_cluster, label_size, community_repulsion, clustering_algorithm) + result = get_clustering_coupling(df.get(),unit_of_analysis, coupling_field, stemmer, impact_measure, cluster_labeling, ngram, num_of_units, min_cluster_freq, label_per_cluster, label_size, community_repulsion, clustering_algorithm) clustering_coupling_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -5848,7 +5853,7 @@ def loading_modal(): modal_content.append(ui.markdown("""

Synonyms to Remove

""")) modal_content.append(ui.HTML(DT(synonyms_data))) - result = get_co_occurence_network(df, field_cn, ngram_cn, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes, + result = get_co_occurence_network(df.get(),field_cn, ngram_cn, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes, repulsion_force, remove_isolated, min_edges, node_opacity, num_of_labels, node_shape, label_size_ls, edge_size, node_shadow, edit_nodes, label_cex, file_upload_terms, file_upload_synonyms) co_occurrence_network_results.set(result) @@ -5895,7 +5900,7 @@ def table_co_occurrence_network(): result = co_occurrence_network_results.get() if result is not None: _, _, co_occurrence_network_tab, _ = result - return ui.HTML(DT(co_occurrence_network_tab, style="width=100%;")) + return ui.HTML(DT(co_occurrence_network_tab, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run co-occurrence network", style="text-align: center; color: #999; font-size: 16px;"), @@ -6068,7 +6073,7 @@ def loading_modal(): cluster = input.thematic_clustering() repulsion = input.thematic_repulsion() - result = get_thematic_map(df, field, n, minfreq, ngram, stemming, + result = get_thematic_map(df.get(),field, n, minfreq, ngram, stemming, label_size, n_labels, repulsion, cluster) thematic_map_results.set(result) except Exception as e: @@ -6116,7 +6121,7 @@ def table_thematic_map(): result = thematic_map_results.get() if result is not None: _, _, thematic_map_table, _, _ = result - return ui.HTML(DT(thematic_map_table, style="width=100%;")) + return ui.HTML(DT(thematic_map_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"), @@ -6129,7 +6134,7 @@ def clusters_thematic_map(): result = thematic_map_results.get() if result is not None: _, _, _, thematic_map_cluster, _ = result - return 
ui.HTML(DT(thematic_map_cluster, style="width=100%;")) + return ui.HTML(DT(thematic_map_cluster, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"), @@ -6142,7 +6147,7 @@ def documents_thematic_map(): result = thematic_map_results.get() if result is not None: _, _, _, _, thematic_map_documents = result - return ui.HTML(DT(thematic_map_documents, maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(thematic_map_documents, maxBytes="10MB", style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"), @@ -6403,7 +6408,7 @@ def loading_modal(): ngrams = input.thematic_evolution_ngram() if field in ["TI", "AB"] else 1 stemming = input.thematic_evolution_stemmer() if field in ["TI", "AB"] else False - result = get_thematic_evolution(df, field, years, n, weight_index, min_weight_index, minfreq, label_size, ngrams, stemming, n_labels, overlap, remove_terms, synonyms, cluster) + result = get_thematic_evolution(df.get(),field, years, n, weight_index, min_weight_index, minfreq, label_size, ngrams, stemming, n_labels, overlap, remove_terms, synonyms, cluster) thematic_evolution_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -6444,7 +6449,7 @@ def table_thematic_evolution(): result = thematic_evolution_results.get() if result is not None: _, thematic_evolution_table, _ = result - return ui.HTML(DT(thematic_evolution_table, style="width=100%;")) + return ui.HTML(DT(thematic_evolution_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), @@ -6483,7 +6488,7 @@ def table_thematic_evolution_2(): if result is not None: _, _, TM = result if len(TM) > 0: - return 
ui.HTML(DT(TM[0]["words"], style="width=100%;")) + return ui.HTML(DT(TM[0]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6496,7 +6501,7 @@ def clusters_thematic_evolution_2(): if result is not None: _, _, TM = result if len(TM) > 0: - return ui.HTML(DT(TM[0]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[0]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6509,7 +6514,7 @@ def documents_thematic_evolution_2(): if result is not None: _, _, TM = result if len(TM) > 0: - return ui.HTML(DT(TM[0]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[0]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6547,7 +6552,7 @@ def table_thematic_evolution_3(): if result is not None: _, _, TM = result if len(TM) > 1: - return ui.HTML(DT(TM[1]["words"], style="width=100%;")) + return ui.HTML(DT(TM[1]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; 
align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6560,7 +6565,7 @@ def clusters_thematic_evolution_3(): if result is not None: _, _, TM = result if len(TM) > 1: - return ui.HTML(DT(TM[1]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[1]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6573,7 +6578,7 @@ def documents_thematic_evolution_3(): if result is not None: _, _, TM = result if len(TM) > 1: - return ui.HTML(DT(TM[1]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[1]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6611,7 +6616,7 @@ def table_thematic_evolution_4(): if result is not None: _, _, TM = result if len(TM) > 2: - return ui.HTML(DT(TM[2]["words"], style="width=100%;")) + return ui.HTML(DT(TM[2]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6624,7 +6629,7 @@ def clusters_thematic_evolution_4(): if result is not None: _, _, TM = result if len(TM) > 2: - return ui.HTML(DT(TM[2]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[2]["clusters"], 
style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6637,7 +6642,7 @@ def documents_thematic_evolution_4(): if result is not None: _, _, TM = result if len(TM) > 2: - return ui.HTML(DT(TM[2]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[2]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6675,7 +6680,7 @@ def table_thematic_evolution_5(): if result is not None: _, _, TM = result if len(TM) > 3: - return ui.HTML(DT(TM[3]["words"], style="width=100%;")) + return ui.HTML(DT(TM[3]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6688,7 +6693,7 @@ def clusters_thematic_evolution_5(): if result is not None: _, _, TM = result if len(TM) > 3: - return ui.HTML(DT(TM[3]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[3]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; 
margin: 20px;" @@ -6701,7 +6706,7 @@ def documents_thematic_evolution_5(): if result is not None: _, _, TM = result if len(TM) > 3: - return ui.HTML(DT(TM[3]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[3]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6739,7 +6744,7 @@ def table_thematic_evolution_6(): if result is not None: _, _, TM = result if len(TM) > 4: - return ui.HTML(DT(TM[4]["words"]), style="width=100%;") + return ui.HTML(DT(TM[4]["words"]), style="width:100%;") return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6752,7 +6757,7 @@ def clusters_thematic_evolution_6(): if result is not None: _, _, TM = result if len(TM) > 4: - return ui.HTML(DT(TM[4]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[4]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6765,7 +6770,7 @@ def documents_thematic_evolution_6(): if result is not None: _, _, TM = result if len(TM) > 4: - return ui.HTML(DT(TM[4]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[4]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return 
ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6995,7 +7000,7 @@ def loading_modal(): labelsize=input.wordmap_labelsize() size=input.wordmap_dot_size() - result = get_factorial_analysis(df, ngram, field, terms_data_wm, synonyms_data_wm, n_terms, n_clusters, num_documents, method, dimX, dimY, topWordPlot, threshold, labelsize, size) + result = get_factorial_analysis(df.get(),ngram, field, terms_data_wm, synonyms_data_wm, n_terms, n_clusters, num_documents, method, dimX, dimY, topWordPlot, threshold, labelsize, size) factorial_analysis_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -7051,7 +7056,7 @@ def show_words_by_cluster(): result = factorial_analysis_results.get() if result is not None: _, _, words_by_cluster, _ = result - return ui.HTML(DT(words_by_cluster, style="width=100%;")) + return ui.HTML(DT(words_by_cluster, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run factorial analysis", style="text-align: center; color: #999; font-size: 16px;"), @@ -7064,7 +7069,7 @@ def show_articles_by_cluster(): result = factorial_analysis_results.get() if result is not None: _, _, _, articles_by_cluster = result - return ui.HTML(DT(articles_by_cluster, style="width=100%;")) + return ui.HTML(DT(articles_by_cluster, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run factorial analysis", style="text-align: center; color: #999; font-size: 16px;"), @@ -7345,7 +7350,7 @@ def show_cocitation_table(): result = co_citation_network_results.get() if result is not None: _, _, cocit_table, _ = result - return ui.HTML(DT(cocit_table, style="width=100%;")) + return 
ui.HTML(DT(cocit_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the co-citation table.", style="text-align: center; color: #666; font-size: 16px;"), @@ -7560,7 +7565,7 @@ def show_hist_table(): result = historiograph_results.get() if result is not None: _, hist_tab, _ = result - return ui.HTML(DT(hist_tab, style="width=100%;")) + return ui.HTML(DT(hist_tab, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the historiograph table.", style="text-align: center; color: #666; font-size: 16px;"), @@ -7865,7 +7870,7 @@ def show_collaboration_table(): result = collaboration_network_results.get() if result is not None: _, _, collab_table, _ = result - return ui.HTML(DT(collab_table, style="width=100%;")) + return ui.HTML(DT(collab_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the collaboration table.", style="text-align: center; color: #666; font-size: 16px;"), @@ -8045,7 +8050,7 @@ def show_world_map_collaboration_table(): result = countries_collaboration_network_results.get() if result is not None: _, world_map_table = result - return ui.HTML(DT(world_map_table, style="width=100%;")) + return ui.HTML(DT(world_map_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the world map collaboration table.", style="text-align: center; color: #666; font-size: 16px;"), diff --git a/functions/__init__.py b/functions/__init__.py index 9a16e7bea..f778f9d49 100644 --- a/functions/__init__.py +++ b/functions/__init__.py @@ -1,2 +1,43 @@ -# Selective imports to avoid loading heavy dependencies automatically. -# Individual modules can still be imported directly when needed. 
\ No newline at end of file +from .get_affiliationproductionovertime import get_affiliation_production_over_time +from .get_annualproduction import get_annual_production +from .get_authorlocalimpact import get_authors_local_impact +from .get_authorproductionovertime import get_author_production_over_time +from .get_averagecitations import get_average_citations +from .get_bradfordlaw import get_bradford_law +from .get_citedcountries import get_cited_countries +from .get_citeddocuments import get_cited_documents +from .get_clusteringcoupling import get_clustering_coupling +from .get_cocitation import get_co_citation +from .get_collaborationnetwork import get_collaboration_network +from .get_correspondingauthorcountries import get_corresponding_author_countries +from .get_countriesproduction import get_countries_production +from .get_countriesproductionovertime import get_countries_production_over_time +from .get_co_occurence_network import get_co_occurence_network +from .get_data import get_data +from .get_database import get_database +from .get_factorialanalysis import get_factorial_analysis +from .get_filters import get_filters +from .get_frequentwords import get_frequent_words +from .get_historiograph import get_historiograph +from .get_localcitedauthors import get_local_cited_authors +from .get_localciteddocuments import get_local_cited_documents +from .get_localcitedreferences import get_local_cited_refs +from .get_localcitedsources import get_local_cited_sources +from .get_lotkalaw import get_lotka_law +from .get_maininformations import get_main_informations +from .get_referencesspectroscopy import get_references_spectroscopy +from .get_relevantaffiliations import get_relevant_affiliations +from .get_relevantauthors import get_relevant_authors +from .get_relevantsources import get_relevant_sources +from .get_sourceslocalimpact import get_sources_local_impact +from .get_sourcesproduction import get_sources_production +from .get_status import get_status +from 
.get_table import get_table +from .get_thematicevolution import get_thematic_evolution +from .get_thematicmap import get_thematic_map +from .get_threefieldplot import get_three_field_plot +from .get_treemap import get_treemap +from .get_trendtopics import get_trend_topics +from .get_wordcloud import get_wordcloud +from .get_wordfrequency import get_word_frequency +from .get_worldmapcollaboration import get_world_map_collaboration \ No newline at end of file diff --git a/functions/get_affiliationproductionovertime.py b/functions/get_affiliationproductionovertime.py index b9a281d8f..241c54855 100644 --- a/functions/get_affiliationproductionovertime.py +++ b/functions/get_affiliationproductionovertime.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_annualproduction.py b/functions/get_annualproduction.py index 3eabe932e..cf0416a08 100644 --- a/functions/get_annualproduction.py +++ b/functions/get_annualproduction.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_authorlocalimpact.py b/functions/get_authorlocalimpact.py index 49341daee..6b98005b0 100644 --- 
a/functions/get_authorlocalimpact.py +++ b/functions/get_authorlocalimpact.py @@ -1,7 +1,15 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +import numpy as np def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impact): diff --git a/functions/get_authorproductionovertime.py b/functions/get_authorproductionovertime.py index d0d7d177c..ca998725a 100644 --- a/functions/get_authorproductionovertime.py +++ b/functions/get_authorproductionovertime.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_averagecitations.py b/functions/get_averagecitations.py index fc5905d80..f4966ccbe 100644 --- a/functions/get_averagecitations.py +++ b/functions/get_averagecitations.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import 
plotly.graph_objects as go diff --git a/functions/get_bradfordlaw.py b/functions/get_bradfordlaw.py index 04303b4de..c609bc6d7 100644 --- a/functions/get_bradfordlaw.py +++ b/functions/get_bradfordlaw.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go import numpy as np diff --git a/functions/get_citedcountries.py b/functions/get_citedcountries.py index 13fffff31..418a4e185 100644 --- a/functions/get_citedcountries.py +++ b/functions/get_citedcountries.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_citeddocuments.py b/functions/get_citeddocuments.py index 56ef36cd1..3c2da055a 100644 --- a/functions/get_citeddocuments.py +++ b/functions/get_citeddocuments.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import 
plotly.graph_objects as go diff --git a/functions/get_clusteringcoupling.py b/functions/get_clusteringcoupling.py index 6ed04b8ae..e584a933b 100644 --- a/functions/get_clusteringcoupling.py +++ b/functions/get_clusteringcoupling.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_co_occurence_network.py b/functions/get_co_occurence_network.py index 09630fe54..32bfb3590 100644 --- a/functions/get_co_occurence_network.py +++ b/functions/get_co_occurence_network.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_cocitation.py b/functions/get_cocitation.py index b4ea191b6..d9bf2f2b1 100644 --- a/functions/get_cocitation.py +++ b/functions/get_cocitation.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import 
plotly.graph_objects as go diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py index a88b14d11..bad0f870f 100644 --- a/functions/get_collaborationnetwork.py +++ b/functions/get_collaborationnetwork.py @@ -1,6 +1,13 @@ from www.services import * import json import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_correspondingauthorcountries.py b/functions/get_correspondingauthorcountries.py index 774a93a77..ac8a9645f 100644 --- a/functions/get_correspondingauthorcountries.py +++ b/functions/get_correspondingauthorcountries.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_countriesproduction.py b/functions/get_countriesproduction.py index a9550a9e8..622d1d78a 100644 --- a/functions/get_countriesproduction.py +++ b/functions/get_countriesproduction.py @@ -1,8 +1,20 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import 
cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import geopandas as gpd def get_countries_production(df): diff --git a/functions/get_countriesproductionovertime.py b/functions/get_countriesproductionovertime.py index 662733890..587ebfb4f 100644 --- a/functions/get_countriesproductionovertime.py +++ b/functions/get_countriesproductionovertime.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_data.py b/functions/get_data.py index a50e9aa2b..9e00ec3c5 100644 --- a/functions/get_data.py +++ b/functions/get_data.py @@ -1,7 +1,18 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from shiny.express import ui +from www.services.format_functions import biblio_json, process_multiple_files +from io import StringIO +from shiny.types import FileInfo 
def get_data(input, database, df, reset_callback=None): @@ -40,7 +51,7 @@ def get_data(input, database, df, reset_callback=None): text = ui.p( f"{database}'s files uploaded and processed successfully! " f"{len(file)} files have been processed and combined. " - f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." + f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." ) else: # Process single file (original logic) @@ -55,12 +66,12 @@ def get_data(input, database, df, reset_callback=None): text = ui.p( f"{database}'s ZIP archive uploaded and extracted successfully! " f"Multiple files have been processed and combined. " - f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." + f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." ) else: text = ui.p( f"{database}'s file uploaded successfully! You can now proceed to analyze your data. " - f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." + f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." ) except Exception as e: text = ui.div( @@ -76,7 +87,7 @@ def get_data(input, database, df, reset_callback=None): reset_callback() text = ui.p( f"{database}'s file uploaded successfully! You can now proceed to analyze your data. " - f"The dataset contains {df.copy().shape[0]} rows and {df.copy().shape[1]} columns." + f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." 
) else: diff --git a/functions/get_database.py b/functions/get_database.py index 37a847fc0..733264d9e 100644 --- a/functions/get_database.py +++ b/functions/get_database.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_factorialanalysis.py b/functions/get_factorialanalysis.py index 1176e5c68..62dad8290 100644 --- a/functions/get_factorialanalysis.py +++ b/functions/get_factorialanalysis.py @@ -1,8 +1,16 @@ from www.services import * from scipy.spatial import ConvexHull, QhullError import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +import numpy as np def distance_to_y(dist, max_dist, scale_factor): norm = math.log1p(dist) / math.log1p(max_dist) diff --git a/functions/get_filters.py b/functions/get_filters.py index 43ee1deeb..83a1b7941 100644 --- a/functions/get_filters.py +++ b/functions/get_filters.py @@ -1,6 +1,13 @@ from www.services import * from functions.get_table import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from 
www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_frequentwords.py b/functions/get_frequentwords.py index 08a7d4d82..31f85090f 100644 --- a/functions/get_frequentwords.py +++ b/functions/get_frequentwords.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go from collections import Counter diff --git a/functions/get_historiograph.py b/functions/get_historiograph.py index 511ab2fc9..bb921d16f 100644 --- a/functions/get_historiograph.py +++ b/functions/get_historiograph.py @@ -2,6 +2,13 @@ from pyvis.network import Network import tempfile import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go import networkx as nx diff --git a/functions/get_localcitedauthors.py b/functions/get_localcitedauthors.py index 2dc6aff78..79795e6fd 100644 --- a/functions/get_localcitedauthors.py +++ b/functions/get_localcitedauthors.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork 
+from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go @@ -71,13 +78,17 @@ def get_local_cited_authors(df, num_of_cited_authors, fast_search=False): layer="below", ) + for col in author_counts.columns: + if author_counts[col].dtype in ['float64','int64']: + author_counts[col] = author_counts[col].fillna(0) + fig.add_trace( go.Scatter( x=author_counts[frequency], y=list(range(len(author_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (author_counts[frequency] / author_counts[frequency].max()), + size=(18 + 6 * (author_counts[frequency] / author_counts[frequency].max())).fillna(18), color=author_counts[frequency], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_localciteddocuments.py b/functions/get_localciteddocuments.py index 0a8d7c4d4..33c22bf06 100644 --- a/functions/get_localciteddocuments.py +++ b/functions/get_localciteddocuments.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_localcitedreferences.py b/functions/get_localcitedreferences.py index 03efd7b77..f2702858e 100644 --- a/functions/get_localcitedreferences.py +++ b/functions/get_localcitedreferences.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork 
import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py index 74fe04864..7648dab61 100644 --- a/functions/get_localcitedsources.py +++ b/functions/get_localcitedsources.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_lotkalaw.py b/functions/get_lotkalaw.py index 4307f0903..778830f7f 100644 --- a/functions/get_lotkalaw.py +++ b/functions/get_lotkalaw.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go import numpy as np diff --git a/functions/get_maininformations.py b/functions/get_maininformations.py index befc35f20..764ff43e9 100644 --- a/functions/get_maininformations.py +++ b/functions/get_maininformations.py @@ -1,9 +1,14 @@ from www.services import * import time import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from 
www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go -from www.services.metatagextraction import metaTagExtraction def get_main_informations(df, log=False): @@ -22,8 +27,8 @@ def get_main_informations(df, log=False): #### Min and Max Year #### start_time = time.time() # Calculate the minimum and maximum publication years - data["Min_Year"] = data["PY"].min() - data["Max_Year"] = data["PY"].max() + data["Min_Year"] = pd.to_numeric(data["PY"], errors="coerce").min() + data["Max_Year"] = pd.to_numeric(data["PY"], errors="coerce").max() print(f"Min and Max Year calculation time: {time.time() - start_time:.4f} seconds") #### Unique Sources #### @@ -103,8 +108,8 @@ def count_authors(entry): # Ensure the 'AU_CO' column exists if "AU_CO" not in data.columns: # Extract the required metadata - df = metaTagExtraction(df, "AU_CO") - data = df.copy() + df_temp = metaTagExtraction(df, "AU_CO") + data["AU_CO"] = df_temp["AU_CO"] # Calculate "Country_Count" with a vectorized function data["Country_Count"] = data["AU_CO"].apply(lambda x: len(set(x))) diff --git a/functions/get_referencesspectroscopy.py b/functions/get_referencesspectroscopy.py index e77bb129d..7ae07b8ed 100644 --- a/functions/get_referencesspectroscopy.py +++ b/functions/get_referencesspectroscopy.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_relevantaffiliations.py b/functions/get_relevantaffiliations.py index 
67baef2be..c8e4e0d4a 100644 --- a/functions/get_relevantaffiliations.py +++ b/functions/get_relevantaffiliations.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_relevantauthors.py b/functions/get_relevantauthors.py index c154b9c2f..7b4b539cf 100644 --- a/functions/get_relevantauthors.py +++ b/functions/get_relevantauthors.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_relevantsources.py b/functions/get_relevantsources.py index 2498bedba..67b3e6fd2 100644 --- a/functions/get_relevantsources.py +++ b/functions/get_relevantsources.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_sourceslocalimpact.py b/functions/get_sourceslocalimpact.py 
index 9af522c33..644726b7a 100644 --- a/functions/get_sourceslocalimpact.py +++ b/functions/get_sourceslocalimpact.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_sourcesproduction.py b/functions/get_sourcesproduction.py index 5b933575b..cf3ca0cbf 100644 --- a/functions/get_sourcesproduction.py +++ b/functions/get_sourcesproduction.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go from www. 
services.cocmatrix import cocMatrix diff --git a/functions/get_status.py b/functions/get_status.py index 16c449ca7..5b717332e 100644 --- a/functions/get_status.py +++ b/functions/get_status.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_table.py b/functions/get_table.py index 9e32be5bf..170bc7e87 100644 --- a/functions/get_table.py +++ b/functions/get_table.py @@ -1,8 +1,19 @@ from www.services import * from functions.get_status import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from shiny.express import ui +from www.services.utils import ICONS +from itables import to_html_datatable as DT +from itables.javascript import JavascriptFunction # Function to create a Plotly table visualization for metadata completeness @@ -82,7 +93,11 @@ def get_table(database, df, dpi=300, filter=False, modal=True): A DataTable object if data is available, otherwise a message indicating no data. 
""" # Retrieve the data from the DataFrame - data = df.copy() + if df is None: + return None, None, None + data = df() if callable(df) else df.copy() + if data is None: + return None, None, None table_html = "" fig = None @@ -208,7 +223,7 @@ def get_table(database, df, dpi=300, filter=False, modal=True): # Return a DataTable object with the data and the HTML/Plotly tables return ui.HTML( DT( - df.copy(), + data, maxBytes="10MB", classes="display compact stripe", style="text-transform: uppercase; font-size: small; table-layout: auto;", diff --git a/functions/get_thematicevolution.py b/functions/get_thematicevolution.py index c1288b8bf..d11789b39 100644 --- a/functions/get_thematicevolution.py +++ b/functions/get_thematicevolution.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_thematicmap.py b/functions/get_thematicmap.py index d0d4bfd71..d5f0c05c5 100644 --- a/functions/get_thematicmap.py +++ b/functions/get_thematicmap.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_threefieldplot.py b/functions/get_threefieldplot.py index 7495cf1dd..2afbf1800 100644 --- 
a/functions/get_threefieldplot.py +++ b/functions/get_threefieldplot.py @@ -1,6 +1,13 @@ from www.services import * import textwrap import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_treemap.py b/functions/get_treemap.py index 3725292fd..49f972671 100644 --- a/functions/get_treemap.py +++ b/functions/get_treemap.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_trendtopics.py b/functions/get_trendtopics.py index 8d853553f..f7e3f8ac6 100644 --- a/functions/get_trendtopics.py +++ b/functions/get_trendtopics.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_wordcloud.py b/functions/get_wordcloud.py index 5aa3b0c6a..ee0f81242 100644 --- a/functions/get_wordcloud.py +++ 
b/functions/get_wordcloud.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_wordfrequency.py b/functions/get_wordfrequency.py index ca1dd76d3..8dfff17e1 100644 --- a/functions/get_wordfrequency.py +++ b/functions/get_wordfrequency.py @@ -1,5 +1,12 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go diff --git a/functions/get_worldmapcollaboration.py b/functions/get_worldmapcollaboration.py index c6319911b..8963471fb 100644 --- a/functions/get_worldmapcollaboration.py +++ b/functions/get_worldmapcollaboration.py @@ -1,6 +1,13 @@ from www.services import * import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go import geopandas as gpd From cc1f065511c74c1ade618a1bbcc9acf0cba339ec Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Thu, 11 Jun 2026 11:07:17 +0200 Subject: 
[PATCH 7/9] Fix all dashboard functions:imports, reactive.Value, NaN markers, collaboration network, world map, histograph, thematic map, co-occurrence, co-citation, clustering, factorial analysis --- functions/get_citedcountries.py | 2 +- functions/get_citeddocuments.py | 2 +- functions/get_clusteringcoupling.py | 6 ++ functions/get_co_occurence_network.py | 8 +- functions/get_cocitation.py | 13 ++- functions/get_collaborationnetwork.py | 59 +++++++++----- functions/get_factorialanalysis.py | 12 +++ functions/get_frequentwords.py | 2 +- functions/get_historiograph.py | 37 ++++++--- functions/get_localciteddocuments.py | 2 +- functions/get_localcitedreferences.py | 2 +- functions/get_localcitedsources.py | 2 +- functions/get_referencesspectroscopy.py | 2 + functions/get_relevantaffiliations.py | 2 +- functions/get_relevantauthors.py | 2 +- functions/get_relevantsources.py | 2 +- functions/get_thematicevolution.py | 14 +++- functions/get_thematicmap.py | 1 + functions/get_treemap.py | 4 +- functions/get_trendtopics.py | 7 +- functions/get_wordcloud.py | 10 ++- functions/get_wordfrequency.py | 2 +- functions/get_worldmapcollaboration.py | 12 ++- www/services/biblionetwork.py | 4 +- www/services/couplingmap.py | 36 ++++++--- www/services/histnetwork.py | 2 + www/services/histplot.py | 2 +- www/services/networkplot.py | 31 +++++-- www/services/termextraction.py | 5 +- www/services/thematicmap.py | 102 ++++++++++++++++-------- 30 files changed, 271 insertions(+), 116 deletions(-) diff --git a/functions/get_citedcountries.py b/functions/get_citedcountries.py index 418a4e185..9d1fd619e 100644 --- a/functions/get_citedcountries.py +++ b/functions/get_citedcountries.py @@ -78,7 +78,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure): y=list(range(n)), mode="markers+text", marker=dict( - size=18 + 6 * (x_values / x_values.max()), + size=(18 + 6 * (x_values / x_values.max())).fillna(18), color=x_values, colorscale=[[0, "#B3D1F2"], [1, 
"#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_citeddocuments.py b/functions/get_citeddocuments.py index 3c2da055a..e6cf3c52c 100644 --- a/functions/get_citeddocuments.py +++ b/functions/get_citeddocuments.py @@ -84,7 +84,7 @@ def get_cited_documents(df, num_of_cited_docs, cited_docs_measure): y=y_vals, mode="markers+text", marker=dict( - size=18 + 6 * (tab[tab.columns[1]] / tab[tab.columns[1]].max()), + size=(18 + 6 * (tab[tab.columns[1]] / tab[tab.columns[1]].max())).fillna(18), color=tab[tab.columns[1]], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_clusteringcoupling.py b/functions/get_clusteringcoupling.py index e584a933b..864313484 100644 --- a/functions/get_clusteringcoupling.py +++ b/functions/get_clusteringcoupling.py @@ -9,6 +9,12 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from www.services.couplingmap import couplingMap, avoid_net_overlaps +import igraph as ig +from pyvis.network import Network +import tempfile +import os + def get_clustering_coupling(df, unit_of_analysis, coupling_measured, stemmer, impact_measure, diff --git a/functions/get_co_occurence_network.py b/functions/get_co_occurence_network.py index 32bfb3590..b313eee52 100644 --- a/functions/get_co_occurence_network.py +++ b/functions/get_co_occurence_network.py @@ -9,6 +9,12 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from www.services.networkplot import network_plot +from pyvis.network import Network +from www.services.couplingmap import avoid_net_overlaps +import tempfile +import os +import matplotlib.pyplot as plt def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes, @@ -146,7 +152,7 @@ def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_alg # 
Generate layout # Using default igraph layout - layout = cocnet['graph']['layout'] + layout = cocnet['graph'].layout_fruchterman_reingold() print("Layout:", layout) # Get coordinates from layout coords = np.array([[pos[0], pos[1]] for pos in layout]) diff --git a/functions/get_cocitation.py b/functions/get_cocitation.py index d9bf2f2b1..23ee7ab98 100644 --- a/functions/get_cocitation.py +++ b/functions/get_cocitation.py @@ -9,7 +9,12 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go - +from www.services.networkplot import network_plot +from pyvis.network import Network +from www.services.couplingmap import avoid_net_overlaps +import tempfile +import os +import json def get_co_citation( df, field, sep, cocit_network_layout, cocit_clustering_algorithm, cocit_repulsion, @@ -46,7 +51,9 @@ def get_co_citation( degree_plot (plotly.graph_objs.Figure): Degree distribution plot for network nodes. """ - M = df + M = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() + print("M type:", type(M)) + print("M columns:", M.columns.tolist() if isinstance(M, pd.DataFrame) else "NOT A DATAFRAME") # Prepare network and title based on field NetRefs = None @@ -105,7 +112,7 @@ def get_co_citation( b = np.random.randint(0, 255) cluster_colors[cluster_id] = f"rgba({r},{g},{b},0.7)" - layout = cocitnet['graph']['layout'] + layout = cocitnet['graph'].layout_fruchterman_reingold() coords = np.array([[pos[0], pos[1]] for pos in layout]) coords = coords / np.abs(coords).max() coords[:, 0] *= 1000 diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py index bad0f870f..0e61058fa 100644 --- a/functions/get_collaborationnetwork.py +++ b/functions/get_collaborationnetwork.py @@ -10,6 +10,11 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from www.services.networkplot import network_plot +from 
pyvis.network import Network +from www.services.couplingmap import avoid_net_overlaps +import tempfile +import os def get_collaboration_network( @@ -56,6 +61,9 @@ def get_collaboration_network( print("Generating collaboration network...") + print("isolates value:", isolates, type(isolates)) + df = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() + isolates = False M = df m = df.copy() NetRefs = None @@ -85,27 +93,34 @@ def get_collaboration_network( normalize = None if colnormalize == "none" else colnormalize # Prepare network plot - netplot = network_plot( - NetMatrix=NetRefs, - normalize=normalize, - Title=Title, - type=network_layout if network_layout != "worldmap" else "auto", - size_cex=True, - size=5, - remove_multiple=False, - edgesize=edgesize * 3, - labelsize=labelsize, - label_cex=label_cex, - curved=curved, - label_n=label_n, - edges_min=edges_min, - label_color=False, - remove_isolates=isolates, - alpha=opacity, - cluster=clustering_algorithm, - community_repulsion=repulsion / 2, - verbose=False - ) + try: + netplot = network_plot( + NetMatrix=NetRefs, + normalize=normalize, + Title=Title, + type=network_layout if network_layout != "worldmap" else "auto", + size_cex=True, + size=5, + remove_multiple=False, + edgesize=edgesize * 3, + labelsize=labelsize, + label_cex=label_cex, + curved=curved, + label_n=label_n, + edges_min=edges_min, + label_color=False, + remove_isolates=isolates, + alpha=opacity, + cluster=clustering_algorithm, + community_repulsion=repulsion / 2, + verbose=False + ) + if len(netplot['graph'].vs) == 0: + raise ValueError("Network is empty. 
Use a larger dataset or disable 'Remove isolated nodes.") + except Exception as e: + import traceback + traceback.print_exc() + raise # Visualization (HTML, density plot, cluster table, degree plot) net = Network(height="98vh", width="100%", notebook=True, cdn_resources="in_line") @@ -119,7 +134,7 @@ def get_collaboration_network( b = np.random.randint(0, 255) cluster_colors[cluster_id] = f"rgba({r},{g},{b},{opacity})" - layout = netplot['graph']['layout'] + layout = netplot['graph'].layout_fruchterman_reingold() coords = np.array([[pos[0], pos[1]] for pos in layout]) coords = coords / np.abs(coords).max() coords[:, 0] *= 1000 diff --git a/functions/get_factorialanalysis.py b/functions/get_factorialanalysis.py index 62dad8290..540ce6df7 100644 --- a/functions/get_factorialanalysis.py +++ b/functions/get_factorialanalysis.py @@ -11,6 +11,18 @@ import plotly.express as px import plotly.graph_objects as go import numpy as np +from www.services.tabletag import table_tag +from typing import Union, Optional, Sequence, Dict, List +import math +from pyvis.network import Network +import tempfile +import os +from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, to_tree +from scipy.spatial.distance import pdist +from prince import CA, MCA +from sklearn.manifold import MDS as SK_MDS +from sklearn.preprocessing import StandardScaler + def distance_to_y(dist, max_dist, scale_factor): norm = math.log1p(dist) / math.log1p(max_dist) diff --git a/functions/get_frequentwords.py b/functions/get_frequentwords.py index 31f85090f..74a0c37f3 100644 --- a/functions/get_frequentwords.py +++ b/functions/get_frequentwords.py @@ -120,7 +120,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): if tag in ['AB', 'TI']: text_data = term_extraction(df, field=tag, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - text_data = text_data.get() + text_data = text_data[f"{tag}_TM"] else: text_data = M[tag] diff --git 
a/functions/get_historiograph.py b/functions/get_historiograph.py index bb921d16f..1c6763542 100644 --- a/functions/get_historiograph.py +++ b/functions/get_historiograph.py @@ -14,7 +14,7 @@ import networkx as nx import os from matplotlib.colors import to_rgba - +from www.services.histplot import histPlot def hex_to_rgba(hex_color, alpha): if not isinstance(hex_color, str) or not hex_color.startswith("#") or len(hex_color) != 7: @@ -37,18 +37,24 @@ def get_historiograph(df, node_label="AU1", histNodes=20, hist_isolates=True, hi filename: nome del file HTML interattivo salvato temporaneamente """ # Pre-elaborazione + df = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() df = metaTagExtraction(df, "SR") hist_results = histNetwork(df, min_citations=0, sep=sep, network=True) # 1. Costruzione iniziale del grafo - hist_plot = histPlot( - hist_results, - n=histNodes, - size=histsize, - remove_isolates=False, # rimozione manuale - label=node_label, - verbose=False - ) + try: + hist_plot = histPlot( + hist_results, + n=histNodes, + size=histsize, + remove_isolates=False, # rimozione manuale + label=node_label, + verbose=False + ) + except Exception as e: + import traceback + traceback.print_exc() + raise # 2. 
Recupera layout e rete iniziale layout_df = pd.DataFrame(hist_plot["layout"]).copy() @@ -100,17 +106,24 @@ def get_historiograph(df, node_label="AU1", histNodes=20, hist_isolates=True, hi # Rimozione Year mancanti hist_data = hist_data[hist_data["Year"].notna()].copy() if hist_data.empty: - raise ValueError("Nessun dato con 'Year' valido per la historiograph.") + hist_data = hist_results["histData"].copy() + hist_data["Year"] = hist_data["Year"].fillna(0) # Posizionamento temporale orizzontale - hist_data = hist_data.sort_values(['cluster', 'Year']) + if 'cluster' in hist_data.columns: + hist_data = hist_data.sort_values(['cluster','Year']) + else: + hist_data = hist_data.sort_values(['Year']) min_year = hist_data["Year"].min() year_range = hist_data["Year"].max() - min_year + 1 # Spazio orizzontale compatto hist_data["x"] = (hist_data["Year"] - min_year) * 60 # invece di /year_range * 1000 # Spazio verticale più ravvicinato tra cluster - hist_data["y"] = hist_data["cluster"] * 150 + np.random.uniform(-30, 30, size=len(hist_data)) + if 'cluster' in hist_data.columns: + hist_data["y"] = hist_data["cluster"] * 150 + np.random.uniform(-30, 30, size=len(hist_data)) + else: + hist_data["y"] = np.random.uniform(-30, 30, size=len(hist_data)) # Tooltip e label robusti diff --git a/functions/get_localciteddocuments.py b/functions/get_localciteddocuments.py index 33c22bf06..442310d95 100644 --- a/functions/get_localciteddocuments.py +++ b/functions/get_localciteddocuments.py @@ -89,7 +89,7 @@ def get_local_cited_documents(df, num_of_local_cited_docs, field_separator, fast y=list(range(len(df_documents))), mode="markers+text", marker=dict( - size=18 + 6 * (df_documents["Local Citations"] / df_documents["Local Citations"].max()), + size=(18 + 6 * (df_documents["Local Citations"] / df_documents["Local Citations"].max())).fillna(18), color=df_documents["Local Citations"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git 
a/functions/get_localcitedreferences.py b/functions/get_localcitedreferences.py index f2702858e..17e8919b1 100644 --- a/functions/get_localcitedreferences.py +++ b/functions/get_localcitedreferences.py @@ -73,7 +73,7 @@ def get_local_cited_refs(df, num_of_cited_refs, field_separator): y=list(range(len(source_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (source_counts["Citations"] / source_counts["Citations"].max()), + size=(18 + 6 * (source_counts["Citations"] / source_counts["Citations"].max())).fillna(18), color=source_counts["Citations"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py index 7648dab61..3dda4007a 100644 --- a/functions/get_localcitedsources.py +++ b/functions/get_localcitedsources.py @@ -65,7 +65,7 @@ def wrap_label(label, width=50): y=list(range(len(source_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (source_counts["N. of Local Citations"] / source_counts["N. of Local Citations"].max()), + size=(18 + 6 * (source_counts["N. of Local Citations"] / source_counts["N. of Local Citations"].max())).fillna(18), color=source_counts["N. 
of Local Citations"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_referencesspectroscopy.py b/functions/get_referencesspectroscopy.py index 7ae07b8ed..97bf15162 100644 --- a/functions/get_referencesspectroscopy.py +++ b/functions/get_referencesspectroscopy.py @@ -9,6 +9,8 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +import re +from plotly.subplots import make_subplots def get_references_spectroscopy(df, start_year, end_year=2005, field_separator_spec=';'): diff --git a/functions/get_relevantaffiliations.py b/functions/get_relevantaffiliations.py index c8e4e0d4a..20268d7e9 100644 --- a/functions/get_relevantaffiliations.py +++ b/functions/get_relevantaffiliations.py @@ -50,7 +50,7 @@ def get_relevant_affiliations(df, num_of_affiliations, disambiguation): y=list(range(len(affiliation_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (affiliation_counts["Articles"] / affiliation_counts["Articles"].max()), + size=(18 + 6 * (affiliation_counts["Articles"] / affiliation_counts["Articles"].max())).fillna(18), color=affiliation_counts["Articles"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_relevantauthors.py b/functions/get_relevantauthors.py index 7b4b539cf..a7be9453a 100644 --- a/functions/get_relevantauthors.py +++ b/functions/get_relevantauthors.py @@ -82,7 +82,7 @@ def get_relevant_authors(df, num_of_authors, frequency="N. 
of Documents"): y=list(range(len(author_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (author_counts[frequency] / author_counts[frequency].max()), + size=(18 + 6 * (author_counts[frequency] / author_counts[frequency].max())).fillna(18), color=author_counts[frequency], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_relevantsources.py b/functions/get_relevantsources.py index 67b3e6fd2..7199bba2e 100644 --- a/functions/get_relevantsources.py +++ b/functions/get_relevantsources.py @@ -66,7 +66,7 @@ def wrap_label(label, width=50): y=list(range(len(source_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (source_counts["N. of Documents"] / source_counts["N. of Documents"].max()), + size=(18 + 6 * (source_counts["N. of Documents"] / source_counts["N. of Documents"].max())).fillna(18), color=source_counts["N. of Documents"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_thematicevolution.py b/functions/get_thematicevolution.py index d11789b39..8c50c7e96 100644 --- a/functions/get_thematicevolution.py +++ b/functions/get_thematicevolution.py @@ -9,6 +9,13 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from pyvis.network import Network +import tempfile +import os +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +from matplotlib.colors import to_hex +from www.services.thematicmap import thematic_map def get_thematic_evolution(df, field="ID", years=None, n=250, weight_index="inc_index", min_weight_index=0.1, minFreq=2, @@ -104,7 +111,6 @@ def thematic_evolution(M, field="ID", years=None, n=250, min_freq=2, size=0.5, n for interval_label, Mk in list_df.items(): Y.append(f"{min(Mk['PY'])}-{max(Mk['PY'])}") - Mk = reactive.Value(Mk) resk_tuple = thematic_map( Mk, field=field, n=n, minfreq=min_freq, ngrams=ngrams, @@ -320,7 +326,7 @@ def 
timeslice(M, breaks=None, k=5): Returns: dict: Dictionary containing DataFrames for each sub-period. """ - M = M.get() + # Convert the 'PY' column to numeric M['PY'] = pd.to_numeric(M['PY'], errors='coerce') @@ -330,6 +336,9 @@ def timeslice(M, breaks=None, k=5): breaks = np.floor(np.linspace(M['PY'].min() - 1, M['PY'].max(), k + 1)) else: breaks = [M['PY'].min() - 1] + breaks + [M['PY'].max()] + breaks = sorted(list(set(breaks))) + if len(breaks) < 2: + raise ValueError("Not enough distinct break points for time sclicing.") # print("breaks:", breaks) @@ -342,6 +351,7 @@ def timeslice(M, breaks=None, k=5): # Split the DataFrame based on intervals split_df = {str(interval): M[M['interval'] == interval].drop(columns=['interval']) for interval in intervals} + split_df = {k: v for k,v in split_df.items() if len(v) > 0} return split_df diff --git a/functions/get_thematicmap.py b/functions/get_thematicmap.py index d5f0c05c5..009e185a0 100644 --- a/functions/get_thematicmap.py +++ b/functions/get_thematicmap.py @@ -9,6 +9,7 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from www.services.thematicmap import thematic_map def get_thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False): diff --git a/functions/get_treemap.py b/functions/get_treemap.py index 49f972671..1e2f9f8f7 100644 --- a/functions/get_treemap.py +++ b/functions/get_treemap.py @@ -9,6 +9,8 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from collections import Counter + def get_treemap(df, ngram, num_of_words, word_type, file_upload_terms, file_upload_synonyms, field_separator_frequent=';'): @@ -94,7 +96,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): if tag in ['AB', 'TI']: text_data = term_extraction(df, 
field=tag, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - text_data = text_data.get() + text_data = text_data[f"{tag}_TM"] else: text_data = M[tag] diff --git a/functions/get_trendtopics.py b/functions/get_trendtopics.py index f7e3f8ac6..00aa7f762 100644 --- a/functions/get_trendtopics.py +++ b/functions/get_trendtopics.py @@ -59,6 +59,8 @@ def get_trend_topics(df, ngram, field_tt, time_window, file_upload_terms_tt, fil # Get trend topics trend_topics = field_by_year(df, field, time_window, word_minimum_frequency, number_of_words_year, remove_terms, synonyms) + print(trend_topics.columns.tolist()) + print(trend_topics.head()) # Plot fig = px.scatter(trend_topics, x='year_med', y='item', size='freq', hover_data=['year_q1', 'year_q3'], height=800) @@ -115,7 +117,8 @@ def field_by_year(df, field, timespan, min_freq, n_items, remove_terms=None, syn trend_med = pd.DataFrame(A.values).apply(lambda x: pd.Series(np.round(np.quantile(np.repeat(df['PY'], x), [0.25, 0.5, 0.75]))), axis=0).T trend_med.columns = ['year_q1', 'year_med', 'year_q3'] trend_med['freq'] = n - trend_med['item'] = A.columns + trend_med['item'] = A.columns.tolist() + trend_med['year_med'] = trend_med['year_med'].astype(float) # Filter by timespan and frequency if timespan is None or len(timespan) != 2: @@ -123,6 +126,6 @@ def field_by_year(df, field, timespan, min_freq, n_items, remove_terms=None, syn trend_med = trend_med[(trend_med['year_med'] >= timespan[0]) & (trend_med['year_med'] <= timespan[1])] trend_med = trend_med[trend_med['freq'] >= min_freq] - trend_med = trend_med.groupby('year_med').apply(lambda x: x.nlargest(n_items, 'freq')).reset_index(drop=True) + trend_med = trend_med.sort_values('freq', ascending=False).groupby('year_med', group_keys=False).head(n_items).reset_index(drop=True) return trend_med diff --git a/functions/get_wordcloud.py b/functions/get_wordcloud.py index ee0f81242..2631ead1f 100644 --- a/functions/get_wordcloud.py +++ 
b/functions/get_wordcloud.py @@ -9,6 +9,14 @@ from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +from collections import Counter +import networkx as nx +import matplotlib.colors as mcolors +import random +import math +from pyvis.network import Network +import tempfile +import os def is_legible_on_white(color): @@ -125,7 +133,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): if tag in ['AB', 'TI']: text_data = term_extraction(df, field=tag, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - text_data = text_data.get() + text_data = text_data[f"{tag}_TM"] else: text_data = M[tag] diff --git a/functions/get_wordfrequency.py b/functions/get_wordfrequency.py index 8dfff17e1..4015e23c3 100644 --- a/functions/get_wordfrequency.py +++ b/functions/get_wordfrequency.py @@ -49,7 +49,7 @@ def get_word_frequency(df, ngram, field_wf, file_upload_terms_wf, file_upload_sy data = term_extraction(df, field=field_wf, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - data = data.get() + if field_wf == 'TI': print(data[f"{field_wf}_TM"]) diff --git a/functions/get_worldmapcollaboration.py b/functions/get_worldmapcollaboration.py index 8963471fb..41a103395 100644 --- a/functions/get_worldmapcollaboration.py +++ b/functions/get_worldmapcollaboration.py @@ -12,13 +12,15 @@ import plotly.graph_objects as go import geopandas as gpd import networkx as nx +import os def get_world_map_collaboration(df, edges_min=1, edgesize=5): # Estrai metadati dai paesi (assumi che tu abbia già AU_CO processato) - M = df - df = metaTagExtraction(df, "AU_CO") - df = df.copy() - + df = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() + if "AU_CO" not in df.columns: + df = metaTagExtraction(df, "AU_CO") + M = df.copy() + # Normalizza e conta le occorrenze dei paesi (come in get_countries_production) 
df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x]) df = df.explode("AU_CO") @@ -39,6 +41,8 @@ def clean_country_names(country): # Costruisci matrice di collaborazione net = biblionetwork(M, analysis="collaboration", network="countries") + if net is None or net.empty: + return go.FigureWidget(go.Figure()), pd.DataFrame(columns=['From', 'To', 'count']) net_df = pd.DataFrame(net) # Costruisci rete diff --git a/www/services/biblionetwork.py b/www/services/biblionetwork.py index 7e65b4880..8e0288300 100644 --- a/www/services/biblionetwork.py +++ b/www/services/biblionetwork.py @@ -71,11 +71,11 @@ def crossprod(A, B): filtered_index = [idx for idx in NetMatrix.index if str(idx).strip()] NetMatrix = NetMatrix.loc[filtered_index, filtered_columns] - M = M.get() # Estrai il dizionario se M è un oggetto + M = M.get() if hasattr(M, 'get') and callable(M.get) and not isinstance(M, pd.DataFrame) else M.copy() # Estrai il dizionario se M è un oggetto db_name = M["DB"].iloc[0] print(f"db_name: {db_name}") - if network == "references" and db_name == "SCOPUS": + if network == "references" and db_name.upper() == "SCOPUS": ind = [i for i, col in enumerate(NetMatrix.columns) if str(col)[0].isalpha()] NetMatrix = NetMatrix.iloc[ind, ind] diff --git a/www/services/couplingmap.py b/www/services/couplingmap.py index e06467898..f9e180001 100644 --- a/www/services/couplingmap.py +++ b/www/services/couplingmap.py @@ -79,22 +79,23 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, DC = DC.reset_index(drop=True) # Raggruppa senza ambiguità - df_lab = DC.groupby('group', as_index=False).apply(lambda x: x.assign( - MNLCS2=x['MNLCS'].where(x['MNLCS'] >= 1), - MNLCS=round(x['MNLCS'], 2), - name=x['name'].str.lower(), - freq=len(x) - )).sort_values(by=['MNLCS'], ascending=False) + df_lab = DC.copy() + df_lab['MNLCS2'] = df_lab['MNLCS'].where(df_lab['MNLCS'] >= 1) + df_lab['MNLCS'] = df_lab.groupby('group')['MNLCS'].transform(lambda x: round(x, 2)) + 
df_lab['name'] = df_lab['name'].str.lower() + df_lab['freq'] = df_lab.groupby('group')['name'].transform('count') + df_lab = df_lab.sort_values(by=['MNLCS'], ascending=False) df = df_lab.groupby('group').apply(lambda x: pd.Series({ 'freq': x['freq'].iloc[0], 'centrality': x['pagerank_centrality'].mean() * 100, 'impact': np.nan_to_num(x['MNLCS2'].mean(skipna=True)), - 'label_cluster': x['group'].iloc[0], + 'label_cluster': x.name, 'color': x['color'].iloc[0], 'label': '\n'.join(x['name'].iloc[:min(n_labels, len(x))].tolist()), 'words': '\n'.join((x['name'] + ' ' + x['MNLCS'].astype(str)).tolist()) })).reset_index() + df['rcentrality'] = df['centrality'].rank() df['rimpact'] = df['impact'].rank() @@ -104,12 +105,12 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, df = df[df['freq'] >= minfreq] df_lab = df_lab[df_lab['group'].isin(df['group'])] - df_lab = df_lab.iloc[:, [0, 6, 14, 7, 3]] + df_lab = df_lab[[analysis, 'cluster', 'freq', 'color', 'MNLCS']] df_lab.columns = [analysis, "Cluster", "ClusterFrequency", "ClusterColor", "NormalizedLocalCitationScore"] df_lab['ClusterName'] = df_lab['Cluster'].map(df.set_index('group')['label']) - M = M.drop(columns=['SR']).reset_index() + M = M.reset_index() if label_term is None: label_term = "null" @@ -314,7 +315,12 @@ def normalizeCitationScore(df, field="documents", impact_measure="local"): # Applica localCitations se richiesto if impact_measure == "local": - df = localCitations(df, fast_search=False, sep=";")['M'] + try: + df = localCitations(df, fast_search=False, sep=";")['M'] + except Exception as e: + import traceback + traceback.print_exc() + raise else: df['LCS'] = 0 @@ -435,7 +441,7 @@ def network(df, analysis, field, stemming, n, cluster, community_repulsion): def labeling(df, df_lab, term, n, n_labels, analysis, ngrams): # Se il termine è TI o AB, estrai termini if term in ["TI", "AB"]: - df = term_extraction(reactive.Value(df), field=term, ngrams=ngrams, verbose=False) + df = 
term_extraction(df, field=term, ngrams=ngrams, verbose=False) df = df.copy() term = f"{term}_TM" @@ -525,6 +531,14 @@ def localCitations(df, fast_search=False, sep=";"): loccit = 1 H = histNetwork(df, min_citations=loccit, sep=sep, network=False) + if H is None: + M['LCS'] = 0 + CR = { + 'Authors': pd.DataFrame(columns=["Authors", "N. of Local Citations"]), + 'Papers': pd.DataFrame(columns=["Paper", "DOI", "Year", "LCS", "GCS"]), + 'M': M + } + return CR LCS = H['histData'] M = H['M'] diff --git a/www/services/histnetwork.py b/www/services/histnetwork.py index dee14c14b..72b5591e9 100644 --- a/www/services/histnetwork.py +++ b/www/services/histnetwork.py @@ -19,6 +19,8 @@ def histNetwork(df, min_citations=0, sep=";", network=True): - M: A DataFrame containing the metadata of the papers with the Local Citation Score (LCS). - LCS: A list containing the Local Citation Score of each paper. """ + from .metatagextraction import metaTagExtraction + df = metaTagExtraction(df, "SR") M = df.copy() db = M['DB'][0] diff --git a/www/services/histplot.py b/www/services/histplot.py index fb5c472f7..d46f920d9 100644 --- a/www/services/histplot.py +++ b/www/services/histplot.py @@ -31,7 +31,7 @@ def histPlot(histResults, n=20, size=5, labelsize=5, remove_isolates=True, title # Selezioniamo il valore di soglia s sorted_LCS = LCS.sort_values(ascending=False) - s = sorted_LCS.iloc[min(n, len(sorted_LCS))] + s = sorted_LCS.iloc[min(n, len(sorted_LCS)) - 1] # Troviamo gli indici (etichette) che soddisfano la condizione LCS >= s selected_columns = sorted_LCS[sorted_LCS >= s].index.tolist() diff --git a/www/services/networkplot.py b/www/services/networkplot.py index 156cfbfd0..b9d545380 100644 --- a/www/services/networkplot.py +++ b/www/services/networkplot.py @@ -1,6 +1,6 @@ from .utils import * from .cocmatrix import * - +import builtins def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", type="auto", label=True, labelsize=1, label_cex=False, 
label_color=False, label_n=None, halo=False, @@ -22,6 +22,7 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t # Create igraph object bsk_network = ig.Graph.Weighted_Adjacency(NetMatrix.values.tolist(), mode=ig.ADJ_UNDIRECTED, attr="weight") bsk_network.vs["name"] = NetMatrix.columns + print(f"Nodes after creation: {len(bsk_network.vs)}") # Compute node degrees deg = np.array(bsk_network.degree()) @@ -60,11 +61,13 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t bsk_network.delete_vertices(indices_to_delete) if bsk_S is not None: bsk_S.delete_vertices(indices_to_delete) + print(f"Nodes after n filter: {len(bsk_network.vs)}") # Simplify the graph if edges_min > 1: remove_multiple = False bsk_network.simplify(multiple=remove_multiple, loops=noloops) + print(f"Nodes after simplify: {len(bsk_network.vs)}") if bsk_S is not None: bsk_S.simplify(multiple=remove_multiple, loops=noloops) @@ -98,6 +101,16 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t if bsk_S is not None: isolates_to_remove = [v.index for v in bsk_S.vs if v["name"] not in bsk_network.vs["name"]] bsk_S.delete_vertices(isolates_to_remove) + print(f"Nodes after remove_isolates: {len(bsk_network.vs)}") + + if len(bsk_network.vs) == 0: + print("Warning: Network is empty after filtering.") + return { + "S": None, + "graph": bsk_network, + "cluster_res": pd.DataFrame(), + "cluster_obj": builtins.type('obj', (object,), {'membership': []})() + } # Apply clustering cl = clustering_network(bsk_network, cluster) @@ -180,6 +193,7 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t else: cluster_res = None + return { "S": S, "graph": bsk_network, @@ -188,6 +202,8 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t } + + def delete_isolates(graph, mode='all'): isolates = [v.index for v in graph.vs if graph.degree(v, mode=mode) == 0] 
graph.delete_vertices(isolates) @@ -279,10 +295,15 @@ def switch_layout(bsk_network, type, community_repulsion): # Normalizza manualmente il layout l_coords = np.array(l.coords) - min_coords = l_coords.min(axis=0) - max_coords = l_coords.max(axis=0) - normalized_coords = (l_coords - min_coords) / (max_coords - min_coords) - l = ig.Layout(normalized_coords.tolist()) + if len(l_coords) == 0: + l = ig.Layout([[0,0]]) + else: + min_coords = l_coords.min(axis=0) + max_coords = l_coords.max(axis=0) + range_coords = max_coords - min_coords + range_coords[range_coords==0] = 1 + normalized_coords = (l_coords - min_coords) / range_coords + l = ig.Layout(normalized_coords.tolist()) return {"l": l, "bsk_network": bsk_network} diff --git a/www/services/termextraction.py b/www/services/termextraction.py index 95bd7040b..5ccc921b9 100644 --- a/www/services/termextraction.py +++ b/www/services/termextraction.py @@ -97,7 +97,4 @@ def term_extraction(df, field="TI", ngrams=1, stemming=False, language="english" if verbose: print(terms_df.sum().sort_values(ascending=False).head(25)) - # Finalize the output - df.set(M) - - return df + return M diff --git a/www/services/thematicmap.py b/www/services/thematicmap.py index 413e1e3c2..ab33123b1 100644 --- a/www/services/thematicmap.py +++ b/www/services/thematicmap.py @@ -2,6 +2,10 @@ from .igraph2vis import * from .termextraction import * from .biblionetwork import * +from pyvis.network import Network +import tempfile +import os +from www.services.couplingmap import avoid_net_overlaps def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False): @@ -78,17 +82,21 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz }) # Filter and process cluster data + print("df_lab before groupby:", df_lab.columns.tolist()) + print("df_lab shape:", df_lab.shape) df_lab = 
(df_lab[df_lab['sC'] >= minfreq] - .groupby('groups') + .groupby('groups', group_keys=False) .apply(lambda x: pd.Series({ 'freq': x['sC'].sum(), 'cluster_label': x.loc[x['sC'].idxmax(), 'words'], - 'sC': list(x['sC']), # Se necessario mantenere i valori di sC - 'words': ', '.join(x['words'].astype(str)), # <-- Converte in stringa pulita - 'color': x['color'].iloc[0] # Prende il primo valore della colonna + 'sC': list(x['sC']), + 'words': ', '.join(x['words'].astype(str)), + 'color': x['color'].iloc[0], + 'groups': x.name })) - .reset_index()) - + .reset_index(drop=True)) + print("df_lab columns:", df_lab.columns.tolist()) + print("df_lab head:", df_lab.head()) # Explode both words and sC columns to create rows for each word and its occurrence count df_lab = df_lab.assign( words=df_lab['words'].str.split(', '), @@ -117,16 +125,10 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz df_lab['words'] = df_lab['words'].astype(str) # Perform left joins equivalent to R's left_join operations - sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']], - left_on='words1', - right_on='words', - how='left') - sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']], - left_on='words2', - right_on='words', - how='left', - suffixes=('', '2')) - + sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']].rename(columns={'words': 'words1_match', 'groups': 'groups1'}), + left_on='words1', right_on='words1_match', how='left').drop(columns=['words1_match'], errors='ignore') + sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']].rename(columns={'words': 'words2_match', 'groups': 'groups2'}), + left_on='words2', right_on='words2_match', how='left').drop(columns=['words2_match'], errors='ignore') # Drop the extra 'words' columns created by the merge sEij_df = sEij_df.drop(['words', 'words_y'], axis=1, errors='ignore') @@ -135,6 +137,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz .groupby('groups') .first() 
.reset_index()) + print("df_lab_top columns:", df_lab_top.columns.tolist()) # Remove duplicate columns sEij_df = sEij_df.loc[:, ~sEij_df.columns.duplicated()] @@ -165,8 +168,8 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz # 3. Filtra correttamente i dati df = ( filtered_df - .assign(ext=lambda x: (x['groups'] != x['groups2']).astype(int)) - .groupby('groups') + .assign(ext=lambda x: (x['groups1'] != x['groups2']).astype(int)) + .groupby('groups1') .agg({ 'words1': lambda x: len(set(x)), 'eij': lambda x: sum(x * x.index), # calculate centrality as sum(eij*ext) @@ -178,14 +181,16 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz 'ext': 'CallonDensity' }) .assign( - CallonDensity=lambda x: x['CallonDensity'] / x['n'] * 100, + CallonDensity=lambda x: x['CallonDensity'] / x['n'].astype(float) * 100, RankCentrality=lambda x: x['CallonCentrality'].rank(), RankDensity=lambda x: x['CallonDensity'].rank() ) - .merge(df_lab_top, on='groups', how='left') - .rename(columns={'cluster_label': 'Cluster', 'freq': 'ClusterFrequency'}) .reset_index() + .rename(columns={'cluster_label': 'Cluster', 'freq': 'ClusterFrequency'}) + .merge(df_lab_top, left_on='groups1', right_on='groups', how='left') + .rename(columns={'cluster_label':'Cluster', 'freq':'ClusterFrequency'}) ) + print("df columns:", df.columns.tolist()) # Calculate plot parameters meandens = df['RankDensity'].mean() @@ -320,7 +325,14 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz ############################################################################################################################################## # Rename and rearrange columns in df_lab - df_lab.columns = ['Cluster', 'Cluster_Frequency', 'Cluster_Label', 'Occurrences', 'Words', 'Color'] + df_lab = df_lab.rename(columns = { + 'freq': 'Cluster_Frequency', + 'cluster_label':'Cluster_Label', + 'sC':'Occurrences', + 'words':'Words', + 
'color':'Color', + 'groups':'Cluster' + }) df_lab = (df_lab .sort_values('Cluster') .dropna(subset=['Color']) @@ -396,7 +408,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz # Generate layout # Using default igraph layout - layout = Net['graph']['layout'] + layout = Net['graph'].layout_fruchterman_reingold() # Get coordinates from layout coords = np.array([[pos[0], pos[1]] for pos in layout]) @@ -560,7 +572,11 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, threshold=0.5): + print("cluster_assignment called, field:", field) + print("cluster_assignment M columns:", M.columns.tolist()) # Integrate stopwords and synonyms in M original field + from www.services.metatagextraction import metaTagExtraction + M = metaTagExtraction(M, "SR") if field in ["AB", "TI"]: field = f"{field}_TM" @@ -571,13 +587,22 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh all_terms = [] all_sr = [] + if 'SR' not in M.columns: + from www.services.metatagextraction import metaTagExtraction + M = metaTagExtraction(M, "SR") + # Iterate through each row for i, terms_list in enumerate(Fi): if isinstance(terms_list, list): - for term in terms_list: - if term: # Skip empty terms - all_terms.append(term.strip()) - all_sr.append(M['SR'].iloc[i]) + items = terms_list + elif isinstance(terms_list, str): + items = [t.strip() for t in terms_list.split(';')] + else: + items = [] + for term in items: + if term: # Skip empty terms + all_terms.append(term.strip()) + all_sr.append(M['SR'].iloc[i]) all_field = pd.DataFrame({ 'terms': all_terms, @@ -605,7 +630,7 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh # Process words dataframe words = words.assign( - p_w=1/words['Occurrences'], + p_w=1/words['Occurrences'].astype(float), p_c=words['pagerank_centrality'] ) @@ -623,13 +648,13 @@ def 
cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh ) # Calculate probabilities - terms = (terms.groupby('SR') - .apply(lambda x: x.assign(pagerank=x['p_c'].sum())) - .reset_index(drop=True) - .groupby(['SR', 'Cluster_Label']) - .agg({'p_w': 'sum', 'p_c': 'max'}) + print("before groupby SR Cluster_Label, terms columns:", terms.columns.tolist()) + terms['pagerank'] = terms.groupby('SR')['p_c'].transform('sum') + terms = (terms.groupby(['SR', 'Cluster_Label']) + .agg({'p_w': 'sum', 'p_c':'max'}) .reset_index() - .rename(columns={'p_c': 'pagerank'})) + .rename(columns={'p_c':'pagerank'})) + print("after groupby terms columns:", terms.columns.tolist()) terms['p'] = terms['p_w'] / terms.groupby('SR')['p_w'].transform('sum') terms = terms.dropna(subset=['Cluster_Label']).drop('p_w', axis=1) @@ -645,20 +670,26 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh terms_pagerank = (terms.merge(terms_max, on='SR') .query('Cluster_Label == Assigned_cluster')[['SR', 'pagerank']]) + print("before pivot, terms columns:", terms.columns.tolist()) + # Pivot and merge results terms = (terms.drop('pagerank', axis=1) .pivot(index='SR', columns='Cluster_Label', values='p') .reset_index() # Ensure SR is only a column .rename_axis(None, axis=1) # Remove any index name ) + print("after pivot, terms columns:", terms.columns.tolist()) # Now merge with terms_max and terms_pagerank terms = terms.merge(terms_max, on='SR').merge(terms_pagerank, on='SR') - + print("after merge terms_max, terms columns:", terms.columns.tolist()) # Process final results if 'DI' not in M.columns: M['DI'] = np.nan year = pd.Timestamp.now().year + 1 + print("terms columns before final merge:", terms.columns.tolist()) + print("SR_cited in terms:", 'SR' in terms.columns) + M = M.reset_index(drop=True) terms = (M.assign( TCpY=lambda x: x['TC']/(year-x['PY']), @@ -669,5 +700,6 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh 
.groupby('Assigned_cluster') .apply(lambda x: x.sort_values('TC', ascending=False)) .reset_index(drop=True)) + print("terms done") return terms From 08fd768a50751a5bca8aa5a64873ed3b3343c16c Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Thu, 11 Jun 2026 12:59:45 +0200 Subject: [PATCH 8/9] Fix PubMed mapping in standardizer.py --- pubmed_test.csv | 21 +++++++++++++++++++++ www/services/etl/standardizer.py | 7 ++++++- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 pubmed_test.csv diff --git a/pubmed_test.csv b/pubmed_test.csv new file mode 100644 index 000000000..adcafa3cf --- /dev/null +++ b/pubmed_test.csv @@ -0,0 +1,21 @@ +TI,AB,PY,SO,JI,VL,IS,BP,EP,DI,UT,PMID,DT,LA,TC,AU,AF,C1,CR,DE,ID,RP,DB,SR +SIFA: A two-stage adaptive ensemble framework for solar irradiance forecasting,Accurate solar irradiance forecasting is increasingly crucial for managing solar,2026,Scientific reports,Sci Rep,16,1,,,17998 [pii];10.1038/s41598-026-53183-2 [doi],42271158,42271158,Journal Article,eng,0,"['Abdel-Basset M', 'Mohamed R', 'Alrashdi I', 'Mahdi M']","['Abdel-Basset, Mohamed', 'Mohamed, Reda', 'Alrashdi, Ibrahim', 'Mahdi, Mahmoud']","['Faculty of Computers and Informatics, Zagazig University, Zagazig, 44519,', 'Faculty of Computers and Informatics, Zagazig University, Zagazig, 44519,', 'Department of Computer Science, College of Computer and Information Sciences,', 'Faculty of Computers and Informatics, Zagazig University, Zagazig, 44519,']",[],[],[],,PUBMED,"Abdel-Basset M, 2026, Scientific reports" +Stabilizing-sensing synergistic geogrid for high-speed railways.,High-speed railway systems demand robust substructure performance and real-time,2026,Nature communications,Nat Commun,,,,,10.1038/s41467-026-74260-0 [doi],42271141,42271141,Journal Article,eng,0,"['Fu H', 'Zhang H', 'Xiao L', 'Zhan W', 'Jiang J', 'Chen Y', 'Bian X']","['Fu, Haoran', 'Zhang, Haoyu', 'Xiao, Liuyang', 'Zhan, Wenhao', 'Jiang, Jianqun', 'Chen, Yunmin', 'Bian, Xuecheng']","['Department 
of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang']",[],[],[],,PUBMED,"Fu H, 2026, Nature communications" +Morphologically tunable mycelium chips for physical reservoir computing.,We introduce a neuromorphic computing substrate based on PEDOT:PSS-infused,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-55550-5 [doi],42271130,42271130,Journal Article,eng,0,"['Telhan O', 'Winiski J', 'Schaak D', 'Siegel M', 'Petrillo N', 'Bayer E']","['Telhan, Orkan', 'Winiski, Jake', 'Schaak, Damen', 'Siegel, Michael', 'Petrillo, Neale', 'Bayer, Eben']","['Ecovative LLC, Green Island, NY, 12183, USA. 
orkan@design.bio.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.']",[],[],[],,PUBMED,"Telhan O, 2026, Scientific reports" +Transforming hemodialysis care: a tripartite collaboration model among medical,Hemodialysis demand is rising as populations age and the chronic kidney disease,2026,Clinical and experimental nephrology,Clin Exp Nephrol,,,,,10.1007/s10157-026-02903-z [doi],42271122,42271122,Journal Article;Review,eng,0,"['Noda R', 'Sakurada T', 'Ichikawa D', 'Shibagaki Y']","['Noda, Ryunosuke', 'Sakurada, Tsutomu', 'Ichikawa, Daisuke', 'Shibagaki, Yugo']","['Division of Nephrology and Hypertension, Department of Internal Medicine, St.', 'Division of Nephrology and Hypertension, Department of Internal Medicine, St.', 'Division of Nephrology and Hypertension, Department of Internal Medicine, St.', 'Division of Nephrology and Hypertension, Department of Internal Medicine, St.']",[],[],[],,PUBMED,"Noda R, 2026, Clinical and experimental nephrology" +Deep Learning-Based Metal Artifact Reduction in Cardiac Computed Tomography: A,Idiopathic ventricular fibrillation (IVF) affects 5-10% of out-of-hospital,2026,Journal of imaging informatics in medicine,J Imaging Inform Med,,,,,10.1007/s10278-026-02030-x [doi],42271106,42271106,Journal Article,eng,0,"['Benigni N', 'Lo Iacono F', 'Verheul LM', 'Guglielmo M', 'Volders P', 'Ter Bekke R', 'Pontone G', 'Hassink RJ', 'Corino VDA']","['Benigni, Nicholas', 'Lo Iacono, Francesca', 'Verheul, Lisa M', 'Guglielmo, Marco', 'Volders, Paul', 'Ter Bekke, Rachel', 'Pontone, Gianluca', 'Hassink, Rutger J', 'Corino, Valentina D A']","['CardioTechLab, Centro Cardiologico Monzino IRCCS, Milan, Italy.', 'Department of Electronics, Information and Bioengineering, Politecnico Di Milano', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, 
The', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, The', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, The', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, The', 'Department of Biomedical, Surgical and Dental Sciences, University of Milan,', 'Perioperative Cardiology and Cardiovascular Imaging Department, Centro', 'Department of Cardiology, Cardiovascular Research Institute Maastricht (CARIM),', 'CardioTechLab, Centro Cardiologico Monzino IRCCS, Milan, Italy.', 'Department of Electronics, Information and Bioengineering, Politecnico Di Milano']",[],[],[],,PUBMED,"Benigni N, 2026, Journal of imaging informatics in medicine" +Spatiotemporal trends of foot and mouth disease (FMD) in Bangladesh from 2017 to,Foot-and-mouth disease (FMD) is a highly contagious transboundary viral disease,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-57440-2 [doi],42271042,42271042,Journal Article,eng,0,"['Ahmed MJ', 'Alam KE', 'Talukder F', 'Bhandari P', 'Bhuiyan MIH', 'Mamun M', 'Rahman MA', 'Chalise R', 'Hossain MI', 'Morshed M', 'Sabrin MS', 'Chowdhury MTI', 'Alam MJ', 'Hossain D']","['Ahmed, Md Jisan', 'Alam, Kazi Estieque', 'Talukder, Faisol', 'Bhandari, Prajwal', 'Bhuiyan, Md Ismile Hossain', 'Mamun, Mustakim', 'Rahman, Md Arifur', 'Chalise, Ritu', 'Hossain, Md Imran', 'Morshed, Moheuddin', 'Sabrin, Mirza Synthia', 'Chowdhury, Md Tazul Islam', 'Alam, Md Jahangir', 'Hossain, Delower']","['Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Pathology, Faculty of Animal Science and Veterinary Medicine,', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Livestock Service (DLS), Dhaka, 1212, Bangladesh.', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Microbiology and Parasitology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 
'Department of Animal Production and Management, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Animal Production and Management, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Surgery and Theriogenology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Microbiology and Parasitology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Pathology, Faculty of Animal Science and Veterinary Medicine,', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Microbiology and Parasitology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Agricultural Chemistry, Faculty of Agriculture, Sher-e-Bangla', 'Department of Animal Production and Management, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Medicine and Public Health, Faculty of Animal Science and']",[],[],[],,PUBMED,"Ahmed MJ, 2026, Scientific reports" +Machine learning-assisted design of a wideband Fe-SiO(2)-MXene metamaterial solar,The manuscript proposed an efficient broadband metamaterial-inspired multilayered,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-57447-9 [doi],42271013,42271013,Journal Article,eng,0,"['Lavadiya S', 'Sorathiya V', 'Jaffar AY', 'Alsayegh AB', 'Khayyat KMJ']","['Lavadiya, Sunil', 'Sorathiya, Vishal', 'Jaffar, Amar Y', 'Alsayegh, Abdulghani Bakur', 'Khayyat, Khalid M Jamil']","['Department of Information and Communication Technology, Marwadi University,', 'Faculty of Engineering and Technology, Parul Institute of Engineering and', 'Computer and Network 
Engineering Department, College of Computing, Umm Al-Qura', 'Computer and Network Engineering Department, College of Computing, Umm Al-Qura', 'Computer and Network Engineering Department, College of Computing, Umm Al-Qura']",[],[],[],,PUBMED,"Lavadiya S, 2026, Scientific reports" +Human migration has surged since 2000 - these maps reveal where people are going.,,2026,Nature,Nature,,,,,10.1038/d41586-026-01796-y [doi],42271002,42271002,News,eng,0,['Naddaf M'],"['Naddaf, Miryam']",[],[],[],[],,PUBMED,"Naddaf M, 2026, Nature" +People are turning to AI chatbots to plug gaps in health information.,,2026,Nature,Nature,,,,,10.1038/d41586-026-01737-9 [doi],42270995,42270995,News,eng,0,['Gerstung M'],"['Gerstung, Moritz']",[],[],[],[],,PUBMED,"Gerstung M, 2026, Nature" +How I use AI to turn failed drugs into new medicines.,,2026,Nature,Nature,,,,,10.1038/d41586-026-01626-1 [doi],42270981,42270981,News,eng,0,['Ulker E'],"['Ulker, Emma']",[],[],[],[],,PUBMED,"Ulker E, 2026, Nature" +An active learning workflow for predicting misfit volume in body-centered cubic,Refractory high-entropy alloys (RHEAs) exhibit exceptional high-temperature,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-57006-2 [doi],42270978,42270978,Journal Article,eng,0,"['Liu S', 'Balachandran PV']","['Liu, Shunshun', 'Balachandran, Prasanna V']","['University of Virginia, Department of Materials Science and Engineering,', 'University of Virginia, Department of Materials Science and Engineering,']",[],[],[],,PUBMED,"Liu S, 2026, Scientific reports" +SERPINA3 and NDRG1 are critical diagnostic immune genes associated with,Preeclampsia (PE) is a pregnancy complication involving immune dysregulation.,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56930-7 [doi],42270919,42270919,Journal Article,eng,0,"['Wu Z', 'Chen S', 'Chen W', 'Xie Y', 'Zhou Z', 'Huang L', 'Wang Y', 'Chen B', 'Yang C', 'Ke Y']","['Wu, Zhuna', 'Chen, Shihong', 'Chen, Weihong', 'Xie, Yajing', 'Zhou, Zhimei', 'Huang, Li', 'Wang, 
Yueli', 'Chen, Binbin', 'Yang, Congmei', 'Ke, Yumin']","['Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian']",[],[],[],,PUBMED,"Wu Z, 2026, Scientific reports" +Hetairos is a histology-based artificial intelligence model for predicting,Molecular testing is essential for classifying central nervous system (CNS),2026,Nature cancer,Nat Cancer,,,,,10.1038/s43018-026-01186-3 [doi],42270902,42270902,Journal Article,eng,0,"['Jin D', 'Shmatko A', 'Patel A', 'Rutz S', 'Friedrich L', 'Banan R', 'Rahmanzade R', 'Sievers P', 'Hamelmann S', 'Schrimpf D', 'Gobel K', 'Bogumil H', 'Maas SLN', 'Sill M', 'Hinz FE', 'Suwala AK', 'Keller F', 'Habel A', 'Rukhovich G', 'Zettl F', 'Alhalabi OT', 'Ille S', 'Sehring J', 'Amsel D', 'Wiestler B', 'Piovesan Lago P', 'Suchorska B', 'Ahmad O', 'Sturm D', 'Reuss D', 'Wesseling P', 'Wohrer A', 'Heppner FL', 'Blumcke I', 'Delbridge C', 'Jakobs M', 'Herold-Mende C', 'Krieg SM', 'Wick W', 'Jones DTW', 'Pfister SM', 'Al-Hussaini M', 'Hou Y', ""D'Almeida Costa F"", 'Schweizer L', 'Bertero L', 'Acker T', 'Tauziede-Espariat A', 'Varlet P', 'Merkler D', 'Egervari K', 'Dohmen H', 'Zoroquiain P', 'Gejman R', 'Brandner S', 'Bai X', 'von Deimling A', 'Sahm F', 'Gerstung 
M']","['Jin, Darui', 'Shmatko, Artem', 'Patel, Areeba', 'Rutz, Samuel', 'Friedrich, Lukas', 'Banan, Rouzbeh', 'Rahmanzade, Ramin', 'Sievers, Philipp', 'Hamelmann, Stefan', 'Schrimpf, Daniel', 'Gobel, Kirsten', 'Bogumil, Henri', 'Maas, Sybren L N', 'Sill, Martin', 'Hinz, Felix E', 'Suwala, Abigail K', 'Keller, Felix', 'Habel, Antje', 'Rukhovich, Gleb', 'Zettl, Ferdinand', 'Alhalabi, Obada T', 'Ille, Sebastian', 'Sehring, Jannik', 'Amsel, Daniel', 'Wiestler, Benedikt', 'Piovesan Lago, Pedro', 'Suchorska, Bogdana', 'Ahmad, Olfat', 'Sturm, Dominik', 'Reuss, David', 'Wesseling, Pieter', 'Wohrer, Adelheid', 'Heppner, Frank L', 'Blumcke, Ingmar', 'Delbridge, Claire', 'Jakobs, Martin', 'Herold-Mende, Christel', 'Krieg, Sandro M', 'Wick, Wolfgang', 'Jones, David T W', 'Pfister, Stefan M', 'Al-Hussaini, Maysa', 'Hou, Yanghao', ""D'Almeida Costa, Felipe"", 'Schweizer, Leonille', 'Bertero, Luca', 'Acker, Till', 'Tauziede-Espariat, Arnault', 'Varlet, Pascale', 'Merkler, Doron', 'Egervari, Kristof', 'Dohmen, Hildegard', 'Zoroquiain, Pablo', 'Gejman, Roger', 'Brandner, Sebastian', 'Bai, Xiangzhi', 'von Deimling, Andreas', 'Sahm, Felix', 'Gerstung, Moritz']","['Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Image Processing Center, Beihang University, Beijing, China.', 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Faculty of Biosciences, Heidelberg University, Heidelberg, Germany.', 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Faculty of Mathematics and Computer Science, Heidelberg University, Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, 
Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Pathology, Leiden University Medical Center, Leiden, The', 'Department of Pathology, Brain Tumor Center, Erasmus MC Cancer Institute,', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Neurooncology, German Cancer Research Center (DKFZ) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Faculty of Mathematics and 
Computer Science, Heidelberg University, Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'AI for Image-Guided Diagnosis and Therapy, School of Medicine and Health,', 'Munich Center for Machine Learning (MCML), Munich, Germany.', 'AC Camargo Cancer Center, Sao Paulo, Brazil.', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Glioma Research, German Cancer Research Center (DKFZ),', 'Department of Pediatric Hematology and Oncology, University Hospital Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Princess Maxima Center for Pediatric Oncology, Utrecht, The Netherlands.', 'Department of Pathology, Amsterdam University Medical Centers/VUmc, Amsterdam,', 'Division of Neuropathology and Neurochemistry, Department of Neurology,', 'Institute of Neuropathology and Neuromolecular Pathology, Medical University of', 'Department of Neuropathology, Charite-Universitatsmedizin Berlin, corporate', 'German Center for Neurodegenerative Diseases (DZNE) within the Helmholtz', 'Department of 
Neuropathology, University Hospital Erlangen, Friedrich-Alexander', 'Institute of Pathology, School of Medicine and Health, Technical University of', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Division for Stereotactic Neurosurgery, Department of Neurosurgery, University', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Neurology Clinic, University Hospital Heidelberg, Heidelberg, Germany.', 'Clinical Cooperation Unit Neurooncology, German Cancer Consortium (DKTK) and', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Glioma Research, German Cancer Research Center (DKFZ),', 'National Center for Tumor Diseases (NCT), Heidelberg, Germany.', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Neurooncology, German Cancer Research Center (DKFZ) and', 'Department of Pediatric Hematology and Oncology, University Hospital Heidelberg,', 'National Center for Tumor Diseases (NCT), Heidelberg, Germany.', 'Department of Cell Therapy and Applied Genomics, King Hussein Cancer Center,', 'Department of Pathology and Laboratory Medicine, King Hussein Cancer Center,', 'Department of Pathology, Center for Molecular Medicine Testing, College of Basic', 'Center for Medical Epigenetics, School of Basic Medical Sciences, Chongqing', 'AC Camargo Cancer Center, Sao Paulo, Brazil.', 'DASA Laboratories, Sao Paulo, Brazil.', 'Edinger Institute, Institute of Neurology, University of Frankfurt am Main,', 'German Cancer Consortium (DKTK) Partner Site Frankfurt/Mainz and German Cancer', 'Frankfurt Cancer Institute (FCI), Frankfurt am Main, Germany.', 'Department of Medical Sciences, University of Turin, Turin, Italy.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'Department of Neuropathology, Sainte-Anne Hospital, Paris, 
France.', 'Inserm, UMR 1266, IMA-Brain, Institut de Psychiatrie et Neurosciences de Paris,', 'Department of Neuropathology, Sainte-Anne Hospital, Paris, France.', 'Inserm, UMR 1266, IMA-Brain, Institut de Psychiatrie et Neurosciences de Paris,', 'Department of Pathology and Immunology, University of Geneva, Geneva,', 'Division of Clinical Pathology, Geneva University Hospital, Geneva, Switzerland.', 'Department of Pathology and Immunology, University of Geneva, Geneva,', 'Division of Clinical Pathology, Geneva University Hospital, Geneva, Switzerland.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'Pathology Department, Faculty of Medicine, Pontificia Universidad Catolica de', 'Pathology Department, Faculty of Medicine, Pontificia Universidad Catolica de', 'Department of Neurodegenerative Disease, UCL Queen Square Institute of Neurology,', 'Image Processing Center, Beihang University, Beijing, China.', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Faculty of Mathematics and Computer Science, Heidelberg University, Heidelberg,']",[],[],[],,PUBMED,"Jin D, 2026, Nature cancer" +StyleGAN-based synthetic image augmentation for multi-class otoscopy image,Accurate diagnosis of eardrum abnormalities is pivotal for effectively managing,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56954-z [doi],42270897,42270897,Journal Article,eng,0,"['Camalan S', 'Langefeld CD', 'Zinnia A', 'Moberly AC', 'Gurcan MN']","['Camalan, Seda', 'Langefeld, Carl D', 'Zinnia, Amy', 'Moberly, Aaron C', 'Gurcan, Metin N']","['Center for 
Artificial Intelligence Research, Wake Forest University School of', 'Center for Artificial Intelligence Research, Wake Forest University School of', 'Biostatistics and Data Science, Wake Forest University School of Medicine,', 'Public Health Sciences, Wake Forest University School of Medicine, Winston-Salem,', 'Biostatistics and Data Science, Wake Forest University School of Medicine,', 'Dept. of Otolaryngology-Head and Neck Surgery, Vanderbilt University Medical', 'Center for Artificial Intelligence Research, Wake Forest University School of']",[],[],[],,PUBMED,"Camalan S, 2026, Scientific reports" +Spatiotemporal evolution and driving factors of water conservation capacity in,Water conservation services serve as a pivotal ecosystem service for water,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56962-z [doi],42270863,42270863,Journal Article,eng,0,"['Hou H', 'Guo F', 'Wang P', 'Lu D', 'Chen C', 'Bai J', 'Li H', 'Bao Z', 'Qin M', 'Liu Y', 'Fan X']","['Hou, Huimin', 'Guo, Feng', 'Wang, Pengquan', 'Lu, Di', 'Chen, Changjie', 'Bai, Junxing', 'Li, Haohao', 'Bao, Zhiqiang', 'Qin, Mingyang', 'Liu, Yufei', 'Fan, Xinjian']","['Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Qinghai Minzu University, Xining, 810007, Qinghai Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, 
China.']",[],[],[],,PUBMED,"Hou H, 2026, Scientific reports" +Detecting application layer DDoS attack using an advanced signature detection,Application-layer Distributed Denial of Service (App-DDoS) attacks are an ongoing,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56617-z [doi],42270859,42270859,Journal Article,eng,0,"['Jaafar AG', 'Ngadi MA', 'Kama N', 'Kamarudin NH', 'Shapawi K']","['Jaafar, Abdul Ghafar', 'Ngadi, Md Asri', 'Kama, Nazri', 'Kamarudin, Nazhatul Hafizah', 'Shapawi, Khairol']","['Faculty of Artificial Intelligence, Universiti Teknologi Malaysia, Kuala Lumpur,', 'Faculty of Computing, Universiti Teknologi Malaysia, Johor Bahru, Johor,', 'Faculty of Artificial Intelligence, Universiti Teknologi Malaysia, Kuala Lumpur,', 'Centre for Cyber Security, Faculty of Information Science and Technology,', 'KYROL Security Labs Sdn. Bhd, Cyberjaya, Malaysia.']",[],[],[],,PUBMED,"Jaafar AG, 2026, Scientific reports" +An IBGWO optimized feature selection framework for sentiment analysis-based,Detecting fraudulent websites is critical to ensuring network security and,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56748-3 [doi],42270851,42270851,Journal Article,eng,0,"['Perumal S', 'Vishwanathan AJ']","['Perumal, Saraswathi', 'Vishwanathan, Anchitaalagammai Jayalakshmi']","['Department of Information Technology, Velammal College of Engineering and', 'Department of Computer Science and Engineering (Cyber Security), Velammal College']",[],[],[],,PUBMED,"Perumal S, 2026, Scientific reports" +Bulk and single-cell transcriptomics reveal prognostic signatures of,Lung adenocarcinoma (LUAD) is one of the most severe malignant tumors.,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56684-2 [doi],42270842,42270842,Journal Article,eng,0,"['Zhou L', 'Lei P', 'Luo Z', 'Xiao J', 'Chen Z']","['Zhou, Lihua', 'Lei, Peng', 'Luo, Zhouguang', 'Xiao, Jie', 'Chen, Zongyu']","['Department of Pulmonary and Critical Care Medicine, Affiliated Hospital of', 
'Department of Neurosurgery, Affiliated Hospital of Guizhou Medical University,', ""Department of Infectious Disease, Longgang People's Hospital (The Longgang Branch"", 'Department of Emergency, Affiliated Hospital of Guizhou Medical University, No.', 'Department of Pulmonary and Critical Care Medicine, Affiliated Hospital of']",[],[],[],,PUBMED,"Zhou L, 2026, Scientific reports" +Machine learning-enabled ECG arrhythmia classification: a systematic and,Electrocardiogram (ECG) signals play a critical role in the early detection of,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56828-4 [doi],42270808,42270808,Journal Article,eng,0,['Melek N'],"['Melek, Negin']","['Faculty of Engineering and Natural Sciences, Gumushane University, Gumushane,']",[],[],[],,PUBMED,"Melek N, 2026, Scientific reports" +Integrative single-cell and spatial transcriptomics with machine learning,Triple-negative breast cancer is marked by extensive cellular heterogeneity and,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56434-4 [doi],42270806,42270806,Journal Article,eng,0,"['Wu J', 'Fan J', 'Sha T', 'Li H']","['Wu, Jinpeng', 'Fan, Jingjing', 'Sha, Tong', 'Li, Hongtao']","['Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang', 'Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang', 'Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang', 'Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang']",[],[],[],,PUBMED,"Wu J, 2026, Scientific reports" diff --git a/www/services/etl/standardizer.py b/www/services/etl/standardizer.py index 579af14e4..a9d40725a 100644 --- a/www/services/etl/standardizer.py +++ b/www/services/etl/standardizer.py @@ -99,7 +99,12 @@ def convert2df( raise ValueError(f"API not supported for source: '{source}'. 
Supported API sources: {list(API_SOURCES.keys())}") retriever = API_SOURCES[source] df = retriever(query=query, max_results=max_results) - mapping = [] + if source == "pubmed": + mapping = PUBMED_MAPPING + elif source == "openalex": + mapping = OPENALEX_MAPPING + else: + mapping = {} db_name = source.upper() elif filepath is not None: if source not in FILE_SOURCES: From aa4bd7a184eb8befebb2bee9c28051170b386f01 Mon Sep 17 00:00:00 2001 From: Mario Losco Date: Thu, 11 Jun 2026 13:02:47 +0200 Subject: [PATCH 9/9] Add pubmed test files to gitignore --- .gitignore | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2815e7977..097e5d010 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,7 @@ pubmed_standardized.csv test_etl.py # Jupyter checkpoints -.ipynb_checkpoints/ \ No newline at end of file +.ipynb_checkpoints/ + +pubmed_test.csv +pubmed_test.txt \ No newline at end of file