From b8d9d281b5d6a1ecb3b9a1a395842548923b70f8 Mon Sep 17 00:00:00 2001 From: Vamsi-CHVVK Date: Fri, 12 Jun 2026 16:04:26 +0200 Subject: [PATCH] completed --- ETL_Execution_Evidence.ipynb | 85 +++++++++++ app.py | 276 ++++++++++++++++++++-------------- execution_log.txt | Bin 0 -> 1762 bytes functions/get_data.py | 11 +- generate_notebook.py | 91 +++++++++++ run.bat | 5 + standardized_api_data.csv | 101 +++++++++++++ test_etl.py | 85 +++++++++++ test_histnetwork.py | 35 +++++ test_perf.py | 28 ++++ www/services/__init__.py | 6 +- www/services/api_retriever.py | 125 +++++++++++++++ www/services/etl.py | 128 ++++++++++++++++ www/services/histnetwork.py | 112 ++++++++------ www/services/standardizer.py | 177 ++++++++++++++++++++++ www/services/validator.py | 67 +++++++++ 16 files changed, 1168 insertions(+), 164 deletions(-) create mode 100644 ETL_Execution_Evidence.ipynb create mode 100644 execution_log.txt create mode 100644 generate_notebook.py create mode 100644 run.bat create mode 100644 standardized_api_data.csv create mode 100644 test_etl.py create mode 100644 test_histnetwork.py create mode 100644 test_perf.py create mode 100644 www/services/api_retriever.py create mode 100644 www/services/etl.py create mode 100644 www/services/standardizer.py create mode 100644 www/services/validator.py diff --git a/ETL_Execution_Evidence.ipynb b/ETL_Execution_Evidence.ipynb new file mode 100644 index 000000000..b09a4c722 --- /dev/null +++ b/ETL_Execution_Evidence.ipynb @@ -0,0 +1,85 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ETL Pipeline Execution Evidence\n", + "This notebook demonstrates the execution of the custom ETL pipeline retrieving data from the OpenAlex API, standardizing it, validating it, and preparing it for the Bibliometrix dashboard." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import pandas as pd\n", + "# Ensure local modules can be imported\n", + "sys.path.append(os.path.abspath(\".\"))\n", + "\n", + "from www.services.etl import ETLPipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Live Query Execution via API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"machine learning\"\n", + "print(f\"Executing live API query to OpenAlex for: {query}...\")\n", + "df_standardized = ETLPipeline.convert2df(source_data=\"API\", source_type=\"OpenAlex\", is_api=True, query=query)\n", + "print(f\"\\nSuccessfully retrieved and standardized {len(df_standardized)} records.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Validation and Normalized Output\n", + "Displaying the first 5 normalized rows demonstrating standard Web of Science columns (e.g., UT, TI, CR, PY)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pd.set_option(\"display.max_columns\", None)\n", + "df_standardized.head(5)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/app.py b/app.py index f0891f894..c5f7226b6 100644 --- a/app.py +++ b/app.py @@ -854,8 +854,66 @@ def indicator_types_ui_all(): ), with ui.nav_panel("None", value="API"): - ui.h3("🚧 Warning: API is under construction 🚧") - + ui.h3("🌐 API Data Extraction", style="color: #5567BB;") + ui.p("Fetch data directly from Open-Access APIs (OpenAlex or PubMed) to instantly begin your bibliometric analysis.") + + with ui.layout_sidebar(fillable=False, fill=False): + with ui.sidebar(id="sidebar_api", position="right"): + ui.h4("API Options", style="color: #5567BB;") + ui.input_select("api_source", "Select API Platform", choices={"OpenAlex": "OpenAlex", "PubMed": "PubMed"}, selected="OpenAlex") + ui.input_text("api_query", "Search Query (e.g. 'machine learning')", value="") + ui.input_action_button("btn_fetch_api", "Fetch Data", class_="btn-primary", style="margin-top: 10px; background-color: #5567BB; color: white; width: 100%;") + + with ui.card(): + ui.h4("API Extraction Status", style="color: #5567BB;") + @render.express() + def api_status_message(): + if input.btn_fetch_api() == 0: + ui.p("Enter a query and click 'Fetch Data' to begin. The data will automatically be standardized to the Web of Science format.") + else: + ui.p(f"Fetched data from {input.api_source()} for query: '{input.api_query()}'") + + @render.data_frame + @reactive.event(input.btn_fetch_api) + def api_preview_table(): + if not input.api_query(): + return pd.DataFrame([{"Message": "Please enter a search query."}]) + + m = ui.modal( + ui.div( + ui.img(src="https://cisslaboral.laleynext.es/Img/loader-circle.gif", height="150px", style="display: block; margin: 0 auto;"), + ui.h4(f"Fetching data from {input.api_source()}...", style="text-align: center;") + ), + easy_close=False, footer=None + ) + ui.modal_show(m) + + try: + # Run our ETL pipeline + res_df = ETLPipeline.convert2df( + source_data="API", + source_type=input.api_source(), + is_api=True, + query=input.api_query() + ) + # Update global state + df.set(res_df) + + # Serialize to CSV for physical file output + csv_df = res_df.copy() + for col in csv_df.columns: + csv_df[col] = csv_df[col].apply(lambda x: ";".join(str(i) for i in x) if isinstance(x, list) else x) + csv_df.to_csv("standardized_api_data.csv", index=False) + + reset_all_analyses() + ui.modal_remove() + ui.notification_show(f"Successfully extracted {len(res_df)} records! Saved to 'standardized_api_data.csv'.", type="message", duration=5) + + return res_df.head(20) + except Exception as e: + ui.modal_remove() + ui.notification_show(f"Error fetching data: {str(e)}", type="error", duration=10) + return pd.DataFrame([{"Error": str(e)}]) with ui.nav_panel("None", value="collections"): ui.h3("🚧 Warning: Merge Collection is under construction 🚧") @@ -2104,9 +2162,7 @@ def sources_production_placeholder(): ui.p("Click the Run Analysis button to generate the sources' production over time visualization.", style="text-align: center; color: #666; font-size: 16px;"), style="height: 400px; display: flex; flex-direction: column; justify-content: center; align-items: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) - # Render the widget directly when result is available - plot_sources_production, _ = result - return plot_sources_production + return None # Hide placeholder when data is available @render_widget def show_sources_production(): @@ -2251,9 +2307,7 @@ def relevant_authors_placeholder(): ui.p("Click the Run Analysis button to generate the most relevant authors visualization.", style="text-align: center; color: #666; font-size: 16px;"), style="height: 400px; display: flex; flex-direction: column; justify-content: center; align-items: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) - # Render the widget directly when result is available - plot_relevant_authors, _ = result - return plot_relevant_authors + return None # Hide placeholder when data is available @render_widget def show_relevant_authors(): @@ -2399,9 +2453,7 @@ def local_cited_authors_placeholder(): ui.p("Click the Run Analysis button to generate the most local cited authors visualization.", style="text-align: center; color: #666; font-size: 16px;"), style="height: 400px; display: flex; flex-direction: column; justify-content: center; align-items: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) - # Render the widget directly when result is available - plot_local_cited_authors, _ = result - return plot_local_cited_authors + return None # Hide placeholder when data is available @render_widget def show_local_cited_authors(): @@ -2544,9 +2596,7 @@ def authors_production_placeholder(): ui.p("Click the Run Analysis button to generate the authors' production over time visualization.", style="text-align: center; color: #666; font-size: 16px;"), style="height: 400px; display: flex; flex-direction: column; justify-content: center; align-items: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) - # Render the widget directly when result is available - plot_authors_production, _, _ = result - return plot_authors_production + return None # Hide placeholder when data is available @render_widget def show_authors_production(): @@ -2861,9 +2911,7 @@ def authors_local_impact_placeholder(): ui.p("Click the Run Analysis button to generate the authors' local impact visualization.", style="text-align: center; color: #666; font-size: 16px;"), style="height: 400px; display: flex; flex-direction: column; justify-content: center; align-items: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) - # Render the widget directly when result is available - plot_authors_local_impact, _ = result - return plot_authors_local_impact + return None # Hide placeholder when data is available @render_widget def show_authors_local_impact(): @@ -3008,9 +3056,7 @@ def relevant_affiliations_placeholder(): ui.p("Click the Run Analysis button to generate the most relevant affiliations visualization.", style="text-align: center; color: #666; font-size: 16px;"), style="height: 400px; display: flex; flex-direction: column; justify-content: center; align-items: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) - # Render the widget directly when result is available - plot_relevant_affiliations, _ = result - return plot_relevant_affiliations + return None # Hide placeholder when data is available @render_widget def show_relevant_affiliations(): @@ -8185,99 +8231,101 @@ def update_plot_settings(): # --- Sidebar Management --- @render.express() -@reactive.event(input.start_button) def toggle_sidebar(): - with ui.tags.div(id="sidebar_2", class_="custom-sidebar"): - with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False): - # Info Section - with ui.accordion_panel("Biblioshiny", icon=ICONS["home_colored"]): - ui.input_action_button("go_about_2", "Biblioshiny", class_="sidebar-button", icon=ICONS["home"]) - # Data Section - with ui.accordion_panel("Data", icon=ICONS["database_colored"]): - ui.input_action_button("go_import_2", "Import or Load", class_="sidebar-button", icon=ICONS["data"]) - ui.input_action_button("go_api_2", "API", class_="sidebar-button", icon=ICONS["api"]) - ui.input_action_button("go_collections_2", "Merge Collection", class_="sidebar-button", icon=ICONS["merge"]) - - # Filters Section - with ui.accordion_panel("Filters", icon=ICONS["filters_colored"]): - ui.input_action_button("go_filters", "Filters", class_="sidebar-button", icon=ICONS["filters"]) - - # Analysis Section - with ui.accordion_panel("Overview", icon=ICONS["play_colored"]): - ui.input_action_button("go_main", "Main Information", class_="sidebar-button", icon=ICONS["overview"]) - ui.input_action_button("go_annual_scientific_production", "Annual Scientific Production", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - ui.input_action_button("go_average_citations_per_year", "Average Citations per Year", class_="sidebar-button", icon=ICONS["average_citations_per_doc"]) - ui.input_action_button("go_three_field_plot", "Three-Field Plot", class_="sidebar-button", icon=ICONS["overview"]) - with ui.accordion_panel("Sources", icon=ICONS["sources_colored"]): - ui.input_action_button("go_most_relevant_sources", "Most Relevant Sources", class_="sidebar-button", icon=ICONS["book_open"] if "book_open" in ICONS else ICONS["sources"]), - ui.input_action_button("go_most_local_cited_sources", "Most Local Cited Sources", class_="sidebar-button", icon=ICONS["book"] if "book" in ICONS else ICONS["sources"]), - ui.input_action_button("go_bradfords_law", "Bradford's Law", class_="sidebar-button", icon=ICONS["annual_growth_rate"]), - ui.input_action_button("go_sources_local_impact", "Sources' Local Impact", class_="sidebar-button", icon=ICONS["star"] if "star" in ICONS else ICONS["sources"]), - ui.input_action_button("go_sources_production_over_time", "Sources' Production over Time", class_="sidebar-button", icon=ICONS["calendar"] if "calendar" in ICONS else ICONS["timespan"]), - with ui.accordion_panel("Authors", icon=ICONS["authors_colored"]): - # Authors Section - ui.span("Authors", style="color: gray;") - ui.input_action_button("go_most_relevant_authors", "Most Relevant Authors", class_="sidebar-button", icon=ICONS["authors"]) - ui.input_action_button("go_most_local_cited_authors", "Most Local Cited Authors", class_="sidebar-button", icon=ICONS["authors_single_authored_docs"]) - ui.input_action_button("go_authors_production_over_time", "Authors' Production over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - ui.input_action_button("go_lotkas_law", "Lotka's Law", class_="sidebar-button", icon=ICONS["overview"]) - ui.input_action_button("go_authors_local_impact", "Authors' Local Impact", class_="sidebar-button", icon=ICONS["star"] if "star" in ICONS else ICONS["authors"]) - # Affiliations Section - ui.span("Affiliations", style="color: gray;") - ui.input_action_button("go_most_relevant_affiliations", "Most Relevant Affiliations", class_="sidebar-button", icon=ICONS["database"]) - ui.input_action_button("go_affiliations_production_over_time", "Affiliations' Production over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - # Countries Section - ui.span("Countries", style="color: gray;") - ui.input_action_button("go_corresponding_authors_countries", "Corresponding Author's Countries", class_="sidebar-button", icon=ICONS["international_co_authorship"]) - ui.input_action_button("go_countries_scientific_production", "Countries' Scientific Production", class_="sidebar-button", icon=ICONS["international_co_authorship"]) - ui.input_action_button("go_countries_production_over_time", "Countries' Production over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - ui.input_action_button("go_most_cited_countries", "Most Cited Countries", class_="sidebar-button", icon=ICONS["book"]) - with ui.accordion_panel("Documents", icon=ICONS["documents_colored"]): - # Documents Section - ui.span("Documents", style="color: gray;") - ui.input_action_button("go_most_global_cited_documents", "Most Global Cited Documents", class_="sidebar-button", icon=ICONS["documents"]) - ui.input_action_button("go_most_local_cited_documents", "Most Local Cited Documents", class_="sidebar-button", icon=ICONS["documents"]) - - # Cited References Section - ui.span("Cited References", style="color: gray;") - ui.input_action_button("go_most_local_cited_references", "Most Local Cited References", class_="sidebar-button", icon=ICONS["references"]) - ui.input_action_button("go_references_spectroscopy", "References Spectroscopy", class_="sidebar-button", icon=ICONS["references"]) - - # Words Section - ui.span("Words", style="color: gray;") - ui.input_action_button("go_most_frequent_words", "Most Frequent Words", class_="sidebar-button", icon=ICONS["authors_keywords_de"]) - ui.input_action_button("go_wordcloud", "WordCloud", class_="sidebar-button", icon=ICONS["authors_keywords_de"]) - ui.input_action_button("go_treemap", "TreeMap", class_="sidebar-button", icon=ICONS["overview"]) - ui.input_action_button("go_words_frequency_over_time", "Words' Frequency over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - ui.input_action_button("go_trend_topics", "Trend Topics", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - - with ui.accordion_panel("Clustering", icon=ICONS["clustering_colored"]): - ui.input_action_button("go_clustering", "Clustering", class_="sidebar-button", icon=ICONS["clustering"]) - - with ui.accordion_panel("Conceptual Structure", icon=ICONS["conceptual_structure_colored"]): - ui.span("Network Approach", style="color: gray;") - ui.input_action_button("go_cooccurrence_network", "Co-occurrence Network", class_="sidebar-button", icon=ICONS["clustering"]) - ui.input_action_button("go_thematic_map", "Thematic Map", class_="sidebar-button", icon=ICONS["overview"]) - ui.input_action_button("go_thematic_evolution", "Thematic Evolution", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - - ui.span("Factorial Approach", style="color: gray;") - ui.input_action_button("go_factorial_analysis", "Factorial Analysis", class_="sidebar-button", icon=ICONS["overview"]) - - with ui.accordion_panel("Intellectual Structure", icon=ICONS["intellectual_structure_colored"]): - ui.input_action_button("go_citation_network", "Citation Network", class_="sidebar-button", icon=ICONS["references"]) - ui.input_action_button("historiograph", "Historiograph", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) - - with ui.accordion_panel("Social Structure", icon=ICONS["social_structure_colored"]): - ui.input_action_button("go_collaboration_network", "Collaboration Network", class_="sidebar-button", icon=ICONS["co_authors_per_doc"]) - ui.input_action_button("go_countries_collaboration_network", "Countries Collaboration Network", class_="sidebar-button", icon=ICONS["international_co_authorship"]) - - with ui.accordion_panel("Report", icon=ICONS["report_colored"]): - ui.input_action_button("go_report", "Report", class_="sidebar-button", icon=ICONS["report"]) - with ui.accordion_panel("Settings", icon=ICONS["settings_colored"]): - ui.input_action_button("go_settings", "Settings", class_="sidebar-button", icon=ICONS["settings"]) - - # --- Footer --- - # Use static positioning and margin-top to avoid overlap with accordion content + data = df.get() + if data is not None and not data.empty: + ui.tags.script("setTimeout(function() { if(typeof setSidebarState === 'function') setSidebarState(true); }, 50);") + with ui.tags.div(id="sidebar_2", class_="custom-sidebar"): + with ui.accordion(id="sidebar_accordion_data", multiple=False, open=False): + # Info Section + with ui.accordion_panel("Biblioshiny", icon=ICONS["home_colored"]): + ui.input_action_button("go_about_2", "Biblioshiny", class_="sidebar-button", icon=ICONS["home"]) + # Data Section + with ui.accordion_panel("Data", icon=ICONS["database_colored"]): + ui.input_action_button("go_import_2", "Import or Load", class_="sidebar-button", icon=ICONS["data"]) + ui.input_action_button("go_api_2", "API", class_="sidebar-button", icon=ICONS["api"]) + ui.input_action_button("go_collections_2", "Merge Collection", class_="sidebar-button", icon=ICONS["merge"]) + + # Filters Section + with ui.accordion_panel("Filters", icon=ICONS["filters_colored"]): + ui.input_action_button("go_filters", "Filters", class_="sidebar-button", icon=ICONS["filters"]) + + # Analysis Section + with ui.accordion_panel("Overview", icon=ICONS["play_colored"]): + ui.input_action_button("go_main", "Main Information", class_="sidebar-button", icon=ICONS["overview"]) + ui.input_action_button("go_annual_scientific_production", "Annual Scientific Production", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + ui.input_action_button("go_average_citations_per_year", "Average Citations per Year", class_="sidebar-button", icon=ICONS["average_citations_per_doc"]) + ui.input_action_button("go_three_field_plot", "Three-Field Plot", class_="sidebar-button", icon=ICONS["overview"]) + with ui.accordion_panel("Sources", icon=ICONS["sources_colored"]): + ui.input_action_button("go_most_relevant_sources", "Most Relevant Sources", class_="sidebar-button", icon=ICONS["book_open"] if "book_open" in ICONS else ICONS["sources"]), + ui.input_action_button("go_most_local_cited_sources", "Most Local Cited Sources", class_="sidebar-button", icon=ICONS["book"] if "book" in ICONS else ICONS["sources"]), + ui.input_action_button("go_bradfords_law", "Bradford's Law", class_="sidebar-button", icon=ICONS["annual_growth_rate"]), + ui.input_action_button("go_sources_local_impact", "Sources' Local Impact", class_="sidebar-button", icon=ICONS["star"] if "star" in ICONS else ICONS["sources"]), + ui.input_action_button("go_sources_production_over_time", "Sources' Production over Time", class_="sidebar-button", icon=ICONS["calendar"] if "calendar" in ICONS else ICONS["timespan"]), + with ui.accordion_panel("Authors", icon=ICONS["authors_colored"]): + # Authors Section + ui.span("Authors", style="color: gray;") + ui.input_action_button("go_most_relevant_authors", "Most Relevant Authors", class_="sidebar-button", icon=ICONS["authors"]) + ui.input_action_button("go_most_local_cited_authors", "Most Local Cited Authors", class_="sidebar-button", icon=ICONS["authors_single_authored_docs"]) + ui.input_action_button("go_authors_production_over_time", "Authors' Production over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + ui.input_action_button("go_lotkas_law", "Lotka's Law", class_="sidebar-button", icon=ICONS["overview"]) + ui.input_action_button("go_authors_local_impact", "Authors' Local Impact", class_="sidebar-button", icon=ICONS["star"] if "star" in ICONS else ICONS["authors"]) + # Affiliations Section + ui.span("Affiliations", style="color: gray;") + ui.input_action_button("go_most_relevant_affiliations", "Most Relevant Affiliations", class_="sidebar-button", icon=ICONS["database"]) + ui.input_action_button("go_affiliations_production_over_time", "Affiliations' Production over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + # Countries Section + ui.span("Countries", style="color: gray;") + ui.input_action_button("go_corresponding_authors_countries", "Corresponding Author's Countries", class_="sidebar-button", icon=ICONS["international_co_authorship"]) + ui.input_action_button("go_countries_scientific_production", "Countries' Scientific Production", class_="sidebar-button", icon=ICONS["international_co_authorship"]) + ui.input_action_button("go_countries_production_over_time", "Countries' Production over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + ui.input_action_button("go_most_cited_countries", "Most Cited Countries", class_="sidebar-button", icon=ICONS["book"]) + with ui.accordion_panel("Documents", icon=ICONS["documents_colored"]): + # Documents Section + ui.span("Documents", style="color: gray;") + ui.input_action_button("go_most_global_cited_documents", "Most Global Cited Documents", class_="sidebar-button", icon=ICONS["documents"]) + ui.input_action_button("go_most_local_cited_documents", "Most Local Cited Documents", class_="sidebar-button", icon=ICONS["documents"]) + + # Cited References Section + ui.span("Cited References", style="color: gray;") + ui.input_action_button("go_most_local_cited_references", "Most Local Cited References", class_="sidebar-button", icon=ICONS["references"]) + ui.input_action_button("go_references_spectroscopy", "References Spectroscopy", class_="sidebar-button", icon=ICONS["references"]) + + # Words Section + ui.span("Words", style="color: gray;") + ui.input_action_button("go_most_frequent_words", "Most Frequent Words", class_="sidebar-button", icon=ICONS["authors_keywords_de"]) + ui.input_action_button("go_wordcloud", "WordCloud", class_="sidebar-button", icon=ICONS["authors_keywords_de"]) + ui.input_action_button("go_treemap", "TreeMap", class_="sidebar-button", icon=ICONS["overview"]) + ui.input_action_button("go_words_frequency_over_time", "Words' Frequency over Time", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + ui.input_action_button("go_trend_topics", "Trend Topics", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + + with ui.accordion_panel("Clustering", icon=ICONS["clustering_colored"]): + ui.input_action_button("go_clustering", "Clustering", class_="sidebar-button", icon=ICONS["clustering"]) + + with ui.accordion_panel("Conceptual Structure", icon=ICONS["conceptual_structure_colored"]): + ui.span("Network Approach", style="color: gray;") + ui.input_action_button("go_cooccurrence_network", "Co-occurrence Network", class_="sidebar-button", icon=ICONS["clustering"]) + ui.input_action_button("go_thematic_map", "Thematic Map", class_="sidebar-button", icon=ICONS["overview"]) + ui.input_action_button("go_thematic_evolution", "Thematic Evolution", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + + ui.span("Factorial Approach", style="color: gray;") + ui.input_action_button("go_factorial_analysis", "Factorial Analysis", class_="sidebar-button", icon=ICONS["overview"]) + + with ui.accordion_panel("Intellectual Structure", icon=ICONS["intellectual_structure_colored"]): + ui.input_action_button("go_citation_network", "Citation Network", class_="sidebar-button", icon=ICONS["references"]) + ui.input_action_button("historiograph", "Historiograph", class_="sidebar-button", icon=ICONS["annual_growth_rate"]) + + with ui.accordion_panel("Social Structure", icon=ICONS["social_structure_colored"]): + ui.input_action_button("go_collaboration_network", "Collaboration Network", class_="sidebar-button", icon=ICONS["co_authors_per_doc"]) + ui.input_action_button("go_countries_collaboration_network", "Countries Collaboration Network", class_="sidebar-button", icon=ICONS["international_co_authorship"]) + + with ui.accordion_panel("Report", icon=ICONS["report_colored"]): + ui.input_action_button("go_report", "Report", class_="sidebar-button", icon=ICONS["report"]) + with ui.accordion_panel("Settings", icon=ICONS["settings_colored"]): + ui.input_action_button("go_settings", "Settings", class_="sidebar-button", icon=ICONS["settings"]) + + # --- Footer --- + # Use static positioning and margin-top to avoid overlap with accordion content with ui.tags.footer( class_="custom-footer", style=( @@ -8344,9 +8392,9 @@ def toggle_sidebar(): }); observer.observe(document.body, { childList: true, subtree: true }); - // Show both sidebars when 'start_button' is clicked + // Show both sidebars when 'start_button' or 'btn_fetch_api' is clicked document.addEventListener("click", function(e) { - if (e.target && e.target.id === "start_button") { + if (e.target && (e.target.id === "start_button" || e.target.id === "btn_fetch_api")) { setSidebarState(true); } }); diff --git a/execution_log.txt b/execution_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbfcb0a5c9634b2e1c780e0339ea35b9d4558b47 GIT binary patch literal 1762 zcmchYPmdBo5XI|k;&&KNT$DKIUQ9dy17O*^AZ*7NdV}}y?{M6jmc)Z|T!&a8s6m;=WG$E&0^YZ*H(yimO#yf+{ z2F(=q4QslkxFh0DS<0%gIlCq60HT=cjN)wB&CsRb>sJ-C9w{FA9JQ*@2F;w6DkzsfxTvzl>2ebHln`4@2aLB*oRV| z)Nrqx&R^LhSL0AqO@=Dk$4hglu14(C!`$r({-)>>EUI{mUg?v3iBDuV zoZAy2Fnxu*riVgYV!vXyaGvk!YK0LyS=Jr#a-zuFGwpDCfo2c3Cc@joEcKW(-5gD) z<}`P+rPF;hLh{@Lqc&Lb(;SW*?#i7^YiwFh9^zH-X-0KdB8TTQ)kJ%qy36a5aR>f| z^Yb^Nn)yH3_|9mqpSX4YHnfkZ2QpGOeBON`D5DU!fR{%QR!mhRUKZWgiOW65UneJ2 zX#>+4`|9+4H~Ep9o7M@M?xZGY?{Gc-jyMa~72Q&;^H;%RJv+gg!zHt?aPb|R@ZMFo zJ>bL+O>gFgp808A@hbVgh6?z;Y&oxY?2h_x`J&@gL}a`sW4#+bme=@qPe>6H+~o61 zuakd!^i~!3=JWg{pTK?!1I_RZYsljN@sI6-Z2pb^#R-4iE>$^ovooPy`tOtS`RV>! GcYgz$a}t^W literal 0 HcmV?d00001 diff --git a/functions/get_data.py b/functions/get_data.py index 16baed992..a1cb761db 100644 --- a/functions/get_data.py +++ b/functions/get_data.py @@ -42,21 +42,24 @@ def get_data(input, database, df, reset_callback=None): else: # Process single file (original logic) type = file[0]["name"] - json = biblio_json(file[0]["datapath"], source, type, author) - df.set(pd.read_json(StringIO(json))) + + # Base Level: Bypass the fragile legacy 'biblio_json' parser and directly use our ETL Pipeline! + clean_df = ETLPipeline.convert2df(source_data=file[0]["datapath"], source_type=source, is_api=False, original_filename=type) + df.set(clean_df) + # Reset all analysis results when new dataset is loaded if reset_callback: reset_callback() if type.endswith(".zip"): text = ui.p( - f"{database}'s ZIP archive uploaded and extracted successfully! " + f"{database}'s ZIP archive uploaded, extracted, and Standardized successfully! " f"Multiple files have been processed and combined. " f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." ) else: text = ui.p( - f"{database}'s file uploaded successfully! You can now proceed to analyze your data. " + f"{database}'s file uploaded and Standardized successfully! You can now proceed to analyze your data. " f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns." ) except Exception as e: diff --git a/generate_notebook.py b/generate_notebook.py new file mode 100644 index 000000000..04f19d98e --- /dev/null +++ b/generate_notebook.py @@ -0,0 +1,91 @@ +import json + +notebook = { + 'cells': [ + { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': [ + '# ETL Pipeline Execution Evidence\n', + 'This notebook demonstrates the execution of the custom ETL pipeline retrieving data from the OpenAlex API, standardizing it, validating it, and preparing it for the Bibliometrix dashboard.' + ] + }, + { + 'cell_type': 'code', + 'execution_count': None, + 'metadata': {}, + 'outputs': [], + 'source': [ + 'import sys\n', + 'import os\n', + 'import pandas as pd\n', + '# Ensure local modules can be imported\n', + 'sys.path.append(os.path.abspath("."))\n', + '\n', + 'from www.services.etl import ETLPipeline' + ] + }, + { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': [ + '## Live Query Execution via API' + ] + }, + { + 'cell_type': 'code', + 'execution_count': None, + 'metadata': {}, + 'outputs': [], + 'source': [ + 'query = "machine learning"\n', + 'print(f"Executing live API query to OpenAlex for: {query}...")\n', + 'df_standardized = ETLPipeline.convert2df(source_data="API", source_type="OpenAlex", is_api=True, query=query)\n', + 'print(f"\\nSuccessfully retrieved and standardized {len(df_standardized)} records.")' + ] + }, + { + 'cell_type': 'markdown', + 'metadata': {}, + 'source': [ + '## Validation and Normalized Output\n', + 'Displaying the first 5 normalized rows demonstrating standard Web of Science columns (e.g., UT, TI, CR, PY).' + ] + }, + { + 'cell_type': 'code', + 'execution_count': None, + 'metadata': {}, + 'outputs': [], + 'source': [ + 'pd.set_option("display.max_columns", None)\n', + 'df_standardized.head(5)' + ] + } + ], + 'metadata': { + 'kernelspec': { + 'display_name': 'Python 3', + 'language': 'python', + 'name': 'python3' + }, + 'language_info': { + 'codemirror_mode': { + 'name': 'ipython', + 'version': 3 + }, + 'file_extension': '.py', + 'mimetype': 'text/x-python', + 'name': 'python', + 'nbconvert_exporter': 'python', + 'pygments_lexer': 'ipython3', + 'version': '3.12.0' + } + }, + 'nbformat': 4, + 'nbformat_minor': 4 +} + +with open('ETL_Execution_Evidence.ipynb', 'w') as f: + json.dump(notebook, f, indent=1) +print('Notebook created successfully.') diff --git a/run.bat b/run.bat new file mode 100644 index 000000000..ce6be75aa --- /dev/null +++ b/run.bat @@ -0,0 +1,5 @@ +@echo off +echo Starting Biblioshiny Dashboard... +echo Your web browser will open automatically. +python -m shiny run --launch-browser app.py +pause diff --git a/standardized_api_data.csv b/standardized_api_data.csv new file mode 100644 index 000000000..8e0c38a96 --- /dev/null +++ b/standardized_api_data.csv @@ -0,0 +1,101 @@ +DB,UT,DI,PMID,TI,SO,JI,PY,DT,LA,TC,AU,AF,C1,RP,CR,DE,ID,AB,VL,IS,BP,EP,SR +PUBMED,,,42221614,The Epi training kit pilot: an inclusive Spanish-language e-learning approach to epidemiology and data science in Latin America and the Caribbean.,Front Public Health,,2026,Journal Article,eng,0,GĂłmez-Bermeo L;Velasco-España JM;Buitrago-LĂłpez A;GĂłmez-MillĂĄn G;CucunubĂĄ ZM,,,,,,,,14,,1745984,,"L GĂłmez-Bermeo, 2026, Front Public Health" +PUBMED,,,42122050,Integrative Multidimensional Machine Learning Models for Stroke Prognosis: Age-Stratified and History Engineered Perspectives.,Diagnostics (Basel),,2026,Journal Article,eng,0,Lee G;Kwon S;Shin SH;Kim C;Yu JY,,,,,,,,16,9,,,"G Lee, 2026, Diagnostics (Basel)" +PUBMED,,,42057274,A Generalized Outlier Method with an Automated Test Protocol Applied to Proficiency Testing Schemes: Utilizing Monte Carlo Simulation for Generating Critical Values for the Extended Grubbs Statistic.,J AOAC Int,,2026,Journal Article,eng,0,Nilsson T;Langer Sigaard S,,,,,,,,,,,,"T Nilsson, 2026, J AOAC Int" +PUBMED,,,42038418,"Sparse regression, classification, and microbial network estimation in QIIME 2 with q2-classo and q2-gglasso.",ArXiv,,2026,Journal Article,eng,0,Vlasovets O;Schaipp F;Simpson L;Bolyen E;Caporaso JG;MĂŒller CL,,,,,,,,,,,,"O Vlasovets, 2026, ArXiv" +PUBMED,,,41944570,Score Matching for Differential Abundance Testing of Compositional High-Throughput Sequencing Data.,Stat Med,,2026,Journal Article,eng,0,Ostner J;Li H;MĂŒller CL,,,,,,,,45,8-9,e70534,,"J Ostner, 2026, Stat Med" +PUBMED,,,41883750,Generative AI for climate governance and acceptability-constrained policy design.,NPJ Clim Action,,2026,Journal Article,eng,0,Manivannan A;Spaiser V;Cann TJB;Evans J;Everall JP;Falkenberg M;Garcia D;Guo W;Herzog R;Otto IM;Oswald Y;Pagan N;Pellert M;Pilgrim C;Rodriguez-Pardo C;Sen I;Vezhnevets AS,,,,,,,,5,1,37,,"A Manivannan, 2026, NPJ Clim Action" +PUBMED,,,41880979,Evaluating quality of care and patient safety with ICD-11: Opportunities for the French National Health Data System (SNDS).,J Epidemiol Popul Health,,2026,Journal Article,eng,0,Boussat B;Boyer L;Quantin C;Southern DA;Ghali WA;GuĂ©ant S;Danjou F;Mokaddem Y;Mercier G;Duclos A;“REDSIAM Quality Group”,,,,,,,,74,2,203372,,"B Boussat, 2026, J Epidemiol Popul Health" +PUBMED,,,41790126,Stability trends of near- and equiatomic (n ⋍ m) Co(n)Mo(m) and Mo(n)Co(m) (n + m = 2-15) subnanoalloys from DFT and K-means clustering.,Phys Chem Chem Phys,,2026,Journal Article,eng,0,de AraĂșjo OGS;Alves AS;Dos Santos Costa M;Andriani KF,,,,,,,,28,12,7494-7503,,"OGS de AraĂșjo, 2026, Phys Chem Chem Phys" +PUBMED,,,41760682,FAIR m-BIDS: Advancing brain data utilization through multimodal and FAIR principles.,Sci Data,,2026,Journal Article,eng,0,Mirhosseini SM;Naseri H;Siahlou B;Panahi Arasi M;Monazami Eslami S;Safaei AA,,,,,,,,13,1,,,"SM Mirhosseini, 2026, Sci Data" +PUBMED,,,41758512,"Clinical Pharmacists, Medications, and Contingency Management for Targeting Smoking in HIV Clinics: A Randomized Clinical Trial.",JAMA Netw Open,,2026,Journal Article,eng,0,Edelman EJ;Deng Y;Dziura J;Nahum-Shani I;Weiss JM;Aoun-Barakat L;Bold KW;Harsono D;Mistler C;Payne E;Aiudi S;Sigel KM;Yager JE;Ledgerwood DM;Bernstein SL,,,,,,,,9,2,e2560593,,"EJ Edelman, 2026, JAMA Netw Open" +PUBMED,,,41700319,Accelerating Catalyst Materials Discovery With Large Artificial Intelligence Models.,Angew Chem Int Ed Engl,,2026,Journal Article,eng,0,Zhang D;Chen Y;Liu C;Liu Y;Xin H;Peng J;Ou P;Li H,,,,,,,,65,16,e26150,,"D Zhang, 2026, Angew Chem Int Ed Engl" +PUBMED,,,41659587,Epithelial Reprogramming and Transition during Pulmonary Bioengineering.,bioRxiv,,2026,Journal Article,eng,0,Mizoguchi S;Lee V;Kim H;Edelstein SE;Wang N;Gracia MT;Danelski C;Haynes C;Rivero R;Stitelman D;Obata T;Greaney AM;Tsuchiya T;Kyriakides TR;Kaminski N;Raredon MSB,,,,,,,,,,,,"S Mizoguchi, 2026, bioRxiv" +PUBMED,,,41572614,Extension of Bootstrap MARS With Group LASSO for Heterogeneous Treatment Effect Estimation.,Stat Med,,2026,Journal Article,eng,0,He G;Wan K;Shimokawa T;Maruo K,,,,,,,,45,1-2,e70370,,"G He, 2026, Stat Med" +PUBMED,,,41526603,Enhancing healthcare workers' safety and well-being through a comprehensive qualitative analysis across hospital settings.,Sci Rep,,2026,Journal Article,eng,0,Foglia E;Ferrario L;Garagiola E,,,,,,,,16,1,5084,,"E Foglia, 2026, Sci Rep" +PUBMED,,,41428500,ARKbase: Antimicrobial Resistance Knowledgebase1.0.,Nucleic Acids Res,,2026,Journal Article,eng,0,Gambhir S;Pandey S;Bajetha H;Kaur J;Das A;Pranavathiyani G;Aggarwal R;Maity U;Dange S;Singh V;Zarkar M;Sankhdher R;Singh B;Seth S;Bhardwaj A,,,,,,,,54,D1,D703-D714,,"S Gambhir, 2026, Nucleic Acids Res" +PUBMED,,,41315831,Ad hoc bandwidth requests and power conservation in 5G wireless networks with tiny cells.,Sci Rep,,2025,Journal Article,eng,0,Rajesh A;Ravikumar CV;Sulthana SF;Kim TH;Shankar T;Srinivasulu A;Altameem T,,,,,,,,15,1,42792,,"A Rajesh, 2025, Sci Rep" +PUBMED,,,41287046,Nipah Virus Inhibitor Knowledgebase (NVIK): a combined evidence approach to prioritise small molecule inhibitors.,J Cheminform,,2025,Journal Article,eng,0,Singh B;Kumari N;Upadhyay A;Pahuja B;Covernton E;Kalia K;Tuteja K;Paul PR;Kumar R;Zarkar MS;Bhardwaj A,,,,,,,,17,1,174,,"B Singh, 2025, J Cheminform" +PUBMED,,,41274086,Polyp image segmentation based on parallel dilated convolution and dual attention mechanisms.,Neural Netw,,2026,Journal Article,eng,0,Chen S;Chen K;Wang C;Zhou Z;Wen S;Zhu T;Wu M,,,,,,,,195,,108282,,"S Chen, 2026, Neural Netw" +PUBMED,,,41241336,"Neuroprotection by post-stroke administration of the slow-releasing hydrogen sulfide (H(2)S) donor AP39: Long-Term functional, MRI, and molecular evidence in a rodent stroke model.",Eur J Pharmacol,,2025,Journal Article,eng,0,Bartosz P;Weronika K;Alicja S;Jakub J;Katarzyna PP;MaƂgorzata S;Monika M;Zuzanna G;Patrycja R;Eugene K;Michel M;Roberta T;Matthew W;Lucyna PC;Joanna P;BogusƂawa B,,,,,,,,1008,,178331,,"P Bartosz, 2025, Eur J Pharmacol" +PUBMED,,,41230453,State-of-Art in Studying the Public Health Effects of Heat: A Literature Review.,Glob Chall,,2025,Journal Article,eng,0,Gianquintieri L;Caiani EG,,,,,,,,9,11,e00381,,"L Gianquintieri, 2025, Glob Chall" +PUBMED,,,41219714,Study protocol for a randomized controlled trial assessing clinical efficacy of digital cognitive rehabilitation for preclinical and mild clinical stages of alzheimer's disease continuum: the MI-RICORDO project.,BMC Psychiatry,,2025,Journal Article,eng,0,Blasi V;Isernia S;Rossetto F;Pagliari C;Borgnis F;Pirastru A;Marzulli M;Foglia E;Garagiola E;Baglio F,,,,,,,,25,1,1075,,"V Blasi, 2025, BMC Psychiatry" +PUBMED,,,41116350,Distribution of singular values in large sample cross-covariance matrices.,Phys Rev E,,2025,Journal Article,eng,0,Swain A;Ridout SA;Nemenman I,,,,,,,,112,3-2,035312,,"A Swain, 2025, Phys Rev E" +PUBMED,,,41092974,CAT-GxD: Centralized access to gene expression datasets.,Anaerobe,,2025,Journal Article,eng,0,Roxas BAP;Roxas JL;Guo JS;LeBauer DS;McCarthy F;Vedantam G;Viswanathan VK,,,,,,,,96,,103005,,"BAP Roxas, 2025, Anaerobe" +PUBMED,,,41049017,Using Choice and Utility Value to Promote Interest: Stimulating Situational Interest in a Lesson and Fostering the Development of Interest in Statistics.,J Educ Psychol,,2025,Journal Article,eng,0,Asher MW;Harackiewicz JM,,,,,,,,117,4,647-662,,"MW Asher, 2025, J Educ Psychol" +PUBMED,,,41038717,Reassessing data management in increasingly complex phenotypic datasets.,Trends Plant Sci,,2026,Journal Article,eng,0,Pommier C;Alic I;Cabrera-Bosquet L;Draye X;Neveu P;Reif JC;Robbins KR;Krajewski P;Tardieu F,,,,,,,,31,5,543-554,,"C Pommier, 2026, Trends Plant Sci" +PUBMED,,,41036253,"Privacy-, linguistic-, and information-preserving synthesis of clinical documentation through generative agents.",Front Artif Intell,,2025,Journal Article,eng,0,van Velzen M;van der Willigen RF;de Beer VJ;de Graaf-Waar HI;Janssen ERC;van Leeuwen S;van der Willigen MF;van der Willigen MJ;Renardus G;El Maaroufi R;Satimin SJ;Hartog LM;Hulsen T;van Meeteren NLU;Scheper MC,,,,,,,,8,,1644084,,"M van Velzen, 2025, Front Artif Intell" +PUBMED,,,41033015,Does BMI influence AI and human reader lung nodule detection in low-dose chest CT?,Eur J Radiol,,2025,Journal Article,eng,0,Sourlos N;van Tuinen M;Sidorenkov G;de Jonge G;Schalekamp S;Pelgrim GJ;Greuter M;Rook M;Prokop M;van Ooijen P;Vliegenthart R,,,,,,,,193,,112453,,"N Sourlos, 2025, Eur J Radiol" +PUBMED,,,41023760,"Sacroiliac joint involvement in psoriatic arthritis - MRI, radiographic and clinical findings in 581 European routine care patients.",Arthritis Res Ther,,2025,Journal Article,eng,0,Vladimirova N;Hadsbjerg AE;Krabbe S;Ciurea A;BubovĂĄ K;GregovĂĄ M;Nissen MJ;Möller B;Micheroli R;Pedersen SJ;ZĂĄvada J;Snoj Z;Pintaric K;Gudbjornsson B;Rotar Z;Eshed I;Sudol-Szopinska I;Gosvig K;Diekhoff T;Lambert RG;de Hooge M;Donzallaz M;Bernatschek A;Hetland ML;Ørnbjerg LM;Østergaard M,,,,,,,,27,1,185,,"N Vladimirova, 2025, Arthritis Res Ther" +PUBMED,,,41015781,Early Prediction and Risk Analysis Using Hybrid Deep Learning Techniques in Multimodal Biomedical Image.,Dev Neurobiol,,2025,Journal Article,eng,0,Vylala A;Plakkottu Radhakrishnan B;Balakrishnan Kadan A,,,,,,,,85,4,e23001,,"A Vylala, 2025, Dev Neurobiol" +PUBMED,,,40656660,Growing Data science Research in Africa to Stimulate Progress (GRASP) program: Rationale and overview.,Equity Neurosci,,2025,Journal Article,eng,0,Uvere E;Kumuthini J;Fatumo S;Taiwo J;Akinyemi R;Ogunniyi A;Ogbole G;Aribisala B;Sarfo F;Jegede A;Akinyemi J;Vedanthan R;Okekunle A;Lackland D;Ovbiagele B;Owolabi M,,,,,,,,1,1,,,"E Uvere, 2025, Equity Neurosci" +PUBMED,,,40648628,The Impact of Automation and Digitalization in Hospital Medication Management: Economic Analysis in the European Countries.,Healthcare (Basel),,2025,Journal Article,eng,0,Orsini FF;Bellavia D;Schettini F;Foglia E,,,,,,,,13,13,,,"FF Orsini, 2025, Healthcare (Basel)" +PUBMED,,,40493067,Changes in numbers needed to treat and hospital care expenditures of optimized indications for primary prevention implantable cardioverter defibrillators: a scenario analysis.,Clin Res Cardiol,,2026,Journal Article,eng,0,van Barreveld M;van Dessel PFHM;Buskens E;Boersma LVA;Delnoy PPHM;Tuinenburg AE;Theuns DAMJ;van der Voort PH;Kimman GP;Verstraelen TE;Zwinderman AH;Wilde AAM;Dijkgraaf MGW,,,,,,,,115,4,576-589,,"M van Barreveld, 2026, Clin Res Cardiol" +PUBMED,,,40475002,Cost-effectiveness of tuberculosis infection screening at first reception into English prisons: a model-based analysis.,EClinicalMedicine,,2025,Journal Article,eng,0,Mafirakureva N;Hunter R;Ferraro CF;Willner S;Finnie T;Hayward A;Lee A;Roy A;Edge C;Dodd PJ,,,,,,,,83,,103245,,"N Mafirakureva, 2025, EClinicalMedicine" +PUBMED,,,40380711,Controlled Intervention Study on Effects of an AI-Based App to Support Wound Care: First Results.,Stud Health Technol Inform,,2025,Journal Article,eng,0,Pinnekamp H;Priester V;Brehmer A;Fischer U,,,,,,,,327,,1295-1296,,"H Pinnekamp, 2025, Stud Health Technol Inform" +PUBMED,,,40251825,Comparison of the effectiveness of visual acuity measurements for amblyopia screening at the age of 36 and 45 months and difference in severity of amblyopia detected.,Acta Ophthalmol,,2025,Journal Article,eng,0,Telleman MAJ;Sloot F;Benjamins SJ;Loudon SE;Spek B;Simonsz HJ;Orthoptic Research Group,,,,,,,,103,7,799-811,,"MAJ Telleman, 2025, Acta Ophthalmol" +PUBMED,,,40027177,Ottimizzazione nel trattamento del soggetto con HIV: analisi di impatto economico e organizzativo di Bictegravir/Emtricitabina/Tenofovir Alafenamide.,Glob Reg Health Technol Assess,,2025,Journal Article,ita,0,Ferrario L;Menzaghi B;Rizzardini G;Roccia A;Garagiola E;Bellavia D;Schettini F;Foglia E,,,,,,,,12,,49-60,,"L Ferrario, 2025, Glob Reg Health Technol Assess" +PUBMED,,,40023406,Cytotoxic T-lymphocyte associated protein 4 inhibitors are associated with a higher risk of cardiovascular events than programmed cell death protein 1 inhibitors in patients with melanoma.,J Am Acad Dermatol,,2025,Journal Article,eng,0,Chang CC;Lo SW;Chang HC;Song J;Chang YC;Yang K;Chi KY;Chang Y;Chiang CH;Chiang CH,,,,,,,,93,1,202-204,,"CC Chang, 2025, J Am Acad Dermatol" +PUBMED,,,39951916,Management of patients with active relapsing-remitting or secondary progressive multiple sclerosis: A French real-world study based on claims data linked to a phase IV study.,Mult Scler Relat Disord,,2025,Journal Article,eng,0,Moisset X;Mercier G;Belhassen M;Deygas F;Civet A;Pau D;Rolland L;Bourel G;Larrieu S;Marchal C,,,,,,,,95,,106305,,"X Moisset, 2025, Mult Scler Relat Disord" +PUBMED,,,39853521,Correction: From Clinical to Non-clinical Outcomes in the Treatment of HIV: An Economic and Organizational Impact Assessment.,Pharmacoecon Open,,2025,Published Erratum,eng,0,Ferrario L;Menzaghi B;Rizzardini G;Roccia A;Garagiola E;Bellavia D;Schettini F;Foglia E,,,,,,,,9,2,327,,"L Ferrario, 2025, Pharmacoecon Open" +PUBMED,,,39779279,Deciphering risk factors for severe postherpetic neuralgia in patients with herpes zoster: an interpretable machine learning approach.,Reg Anesth Pain Med,,2026,Journal Article,eng,0,Park SJ;Han J;Choi JB;Min SK;Park J;Choi S,,,,,,,,51,4,429-436,,"SJ Park, 2026, Reg Anesth Pain Med" +PUBMED,,,39734790,Using Data-Science Approaches to Unravel Insights for Enhanced Transport of Lithium Ions in Single-Ion Conducting Polymer Electrolytes.,Chem Mater,,2024,Journal Article,eng,0,Zhu Q;Liu Y;Shepard LB;Bhattacharya D;Sinnott SB;Reinhart WF;Cooper VR;Kumar R,,,,,,,,36,24,11934-11946,,"Q Zhu, 2024, Chem Mater" +PUBMED,,,39713439,Score matching for differential abundance testing of compositional high-throughput sequencing data.,bioRxiv,,2024,Journal Article,eng,0,Ostner J;Li H;MĂŒller CL,,,,,,,,,,,,"J Ostner, 2024, bioRxiv" +PUBMED,,,39709670,Glucagon-like Peptide-1 Agonists Reduce Cardiovascular Events in Cancer Patients on Immune Checkpoint Inhibitors.,Eur J Cancer,,2025,Journal Article,eng,0,Chiang CH;Song J;Chi KY;Chang YC;Xanthavanij N;Chang Y;Hsia YP;Chiang CH;Ghamari A;Reynolds KL;Lin S;Xu XH;Neilan TG,,,,,,,,216,,115170,,"CH Chiang, 2025, Eur J Cancer" +PUBMED,,,39632680,Emergent properties of the lysine methylome reveal regulatory roles via protein interactions and histone mimicry.,Epigenomics,,2025,Journal Article,eng,0,Pollin G;Chi YI;Mathison AJ;Zimmermann MT;Lomberk G;Urrutia R,,,,,,,,17,1,5-20,,"G Pollin, 2025, Epigenomics" +PUBMED,,,39532817,From Clinical to Non-clinical Outcomes in the Treatment of HIV: An Economic and Organizational Impact Assessment.,Pharmacoecon Open,,2025,Journal Article,eng,0,Ferrario L;Menzaghi B;Rizzardini G;Roccia A;Garagiola E;Bellavia D;Schettini F;Foglia E,,,,,,,,9,2,313-326,,"L Ferrario, 2025, Pharmacoecon Open" +PUBMED,,,39528391,Using 'Situation-Background-Assessment-Recommendation' Method in Palliative Care to Enhance Handover Quality and Nursing Practice: A Mix Method Study.,J Clin Nurs,,2025,Journal Article,eng,0,Pinto F;Roberto P;Ferrario L;Marotta L;Montani D;Auletta G;Zoppini L;Foglia E,,,,,,,,34,1,117-127,,"F Pinto, 2025, J Clin Nurs" +PUBMED,,,39526992,Performance evaluation of the introduction of full sample traceability system within the specimen collection process.,Clin Chem Lab Med,,2025,Journal Article,eng,0,Foglia E;Garagiola E;Ferrario L;Plebani M,,,,,,,,63,4,723-733,,"E Foglia, 2025, Clin Chem Lab Med" +PUBMED,,,39518081,From Real-World Data to Causally Interpretable Models: A Bayesian Network to Predict Cardiovascular Diseases in Adolescents and Young Adults with Breast Cancer.,Cancers (Basel),,2024,Journal Article,eng,0,Bernasconi A;Zanga A;Lucas PJF;Scutari M;Di Cosimo S;De Santis MC;La Rocca E;Baili P;Cavallo I;Verderio P;Ciniselli CM;Pizzamiglio S;Blanda A;Perego P;Vallerio P;Stella F;Trama A;The Ada Working Group,,,,,,,,16,21,,,"A Bernasconi, 2024, Cancers (Basel)" +PUBMED,,,39450759,Effectiveness of Omega-3 Fatty Acids Versus Placebo in Subjects at Ultra-High Risk for Psychosis: The PURPOSE Randomized Clinical Trial.,Schizophr Bull,,2025,Journal Article,eng,0,Winter-van Rossum I;Slot MIE;van Hell HH;Bossong MG;Berger G;Aschauer H;Maat A;Walitza S;Lavan O;Baeza I;Dolz M;Monducci E;Fiori Nastro P;Kroken RA;Lawrie SM;DĂ­az-Caneja CM;Renner T;Schlögelhofer M;Scharinger C;Spalletta G;Banaj N;Otero S;Schipper M;Kwakkel DB;PURPOSE Study Group;Kahn RS,,,,,,,,51,4,1082-1091,,"I Winter-van Rossum, 2025, Schizophr Bull" +PUBMED,,,39435882,Elucidation of DPP-4 involvement in systemic distribution and renal reabsorption of linagliptin by PBPK modeling with a cluster Gauss-Newton method.,Clin Transl Sci,,2024,Journal Article,eng,0,Nakamura R;Yoshikado T;Aoki Y;Sugiyama Y;Chiba K,,,,,,,,17,10,e70047,,"R Nakamura, 2024, Clin Transl Sci" +PUBMED,,,39401960,Exploration of the relationship between general health-related problems and subclinical coronary artery disease: a cross-sectional study in a general population.,BMJ Open,,2024,Journal Article,eng,0,Koopman MY;van der Ende MY;Reijnders JJW;Willemsen RTA;van Bruggen R;Gratama JWC;Kietselaer BLJH;van der Harst P;Vliegenthart R,,,,,,,,14,10,e079835,,"MY Koopman, 2024, BMJ Open" +PUBMED,,,39400639,Recommendations for the creation of benchmark datasets for reproducible artificial intelligence in radiology.,Insights Imaging,,2024,Journal Article,eng,0,Sourlos N;Vliegenthart R;Santinha J;Klontzas ME;Cuocolo R;Huisman M;van Ooijen P,,,,,,,,15,1,248,,"N Sourlos, 2024, Insights Imaging" +PUBMED,,,39395985,Cross-cultural adaptation and validation of the Hospital Survey on Patient Safety questionnaire for a Chilean hospital.,BMC Nurs,,2024,Journal Article,eng,0,Hurtado-Arenas P;Guevara MR;GonzĂĄlez-ChordĂĄ VM,,,,,,,,23,1,748,,"P Hurtado-Arenas, 2024, BMC Nurs" +PUBMED,,,39332590,The incidence and risk of cardiovascular events associated with pembrolizumab in patients with breast cancer.,Cancer Lett,,2024,Journal Article,eng,0,Chiang CH;Xu XH;Song J;Xanthavanij N;Chi KY;Chang YC;Chang Y;Hsiao CL;Hsia YP;Chiang CH;Lin S,,,,,,,,611,,217277,,"CH Chiang, 2024, Cancer Lett" +PUBMED,,,39319287,Automated Drugs Dispensing Systems in Hospitals: a Health Technology Assessment (HTA) Study Across Six European Countries.,Clinicoecon Outcomes Res,,2024,Journal Article,eng,0,Foglia E;Asperti F;Antonacci G;Jani YH;Garagiola E;Bellavia D;Ferrario L,,,,,,,,16,,679-696,,"E Foglia, 2024, Clinicoecon Outcomes Res" +PUBMED,,,39232868,"Association between the antibiotics use and recurrence in patients with resected colorectal cancer: EVADER-1, a nation-wide pharmaco-epidemiologic study.",Dig Liver Dis,,2025,Journal Article,eng,0,Hilmi M;Khati I;Turpin A;Andremont A;Burdet C;Grall N;Vidal J;Bousquet PJ;Rousseau B;Bihan-Benjamin CL,,,,,,,,57,1,89-96,,"M Hilmi, 2025, Dig Liver Dis" +PUBMED,,,39168785,Clinical performance of Bladder EpiCheckℱ versus voided urine cytology for detecting recurrence of nonmuscle invasive bladder cancer: Systematic review and meta-analysis.,Urol Oncol,,2024,Journal Article,eng,0,Chiang CH;Chang YC;Peng CY;Wang SS;Jaroenlapnopparat A;Wang JCH;Jou CL;Tang PU;Hsia YP;Chiang CH;Chiang CH,,,,,,,,42,12,449.e21-449.e28,,"CH Chiang, 2024, Urol Oncol" +PUBMED,,,39150589,Correction: Effect of emphysema on AI software and human reader performance in lung nodule detection from low-dose chest CT.,Eur Radiol Exp,,2024,Published Erratum,eng,0,Sourlos N;Pelgrim G;Wisselink HJ;Yang X;de Jonge G;Rook M;Prokop M;Sidorenkov G;van Tuinen M;Vliegenthart R;van Ooijen PMA,,,,,,,,8,1,94,,"N Sourlos, 2024, Eur Radiol Exp" +PUBMED,,,39145403,Metadata for Data dIscoverability aNd Study rEplicability in obseRVAtional Studies (MINERVA): Lessons Learnt From the MINERVA Project in Europe.,Pharmacoepidemiol Drug Saf,,2024,Journal Article,eng,0,Gini R;Pajouheshnia R;Gutierrez L;Swertz MA;Hyde E;Sturkenboom M;Arana A;Franzoni C;Ehrenstein V;Roberto G;Gil M;MaciĂĄ MA;SchĂ€fer W;Haug U;Thurin NH;Lassalle R;Droz-Perroteau C;Zaccagnino S;Busto MP;Middelkoop B;Gembert K;Sanchez-Saez F;Rodriguez-Bernal C;SanfĂ©lix-Gimeno G;Hurtado I;Acosta MB;Poblador-Plou B;Carmona-PĂ­rez J;Gimeno-Miguel A;Prados-Torres A;Schultze A;Jansen E;Herings R;Kuiper J;Locatelli I;Jazbar J;Ćœerovnik Ć ;Kos M;Smit S;Lind S;Metspalu A;Simou S;Hedenmalm K;Cochino A;Alcini P;Kurz X;Perez-Gutthann S,,,,,,,,33,8,e5884,,"R Gini, 2024, Pharmacoepidemiol Drug Saf" +PUBMED,,,39058407,Accessing a Diverse Set of Functional Red-Light Photoswitches by Selective Copper-Catalyzed Indigo N-Arylation.,J Am Chem Soc,,2024,Journal Article,eng,0,Jaiswal AK;Saha P;Jiang J;Suzuki K;Jasny A;Schmidt BM;Maeda S;Hecht S;Huang CD,,,,,,,,146,31,21367-21376,,"AK Jaiswal, 2024, J Am Chem Soc" +PUBMED,,,39043728,Author Correction: IRE1α-XBP1s pathway promotes prostate cancer by activating c-MYC signaling.,Nat Commun,,2024,Published Erratum,eng,0,Sheng X;Nenseth HZ;Qu S;Kuzu OF;Frahnow T;Simon L;Greene S;Zeng Q;Fazli L;Rennie PS;Mills IG;Danielsen H;Theis F;Patterson JB;Jin Y;Saatcioglu F,,,,,,,,15,1,6190,,"X Sheng, 2024, Nat Commun" +PUBMED,,,39018014,The graded multidimensional geometry of phenotypic variation and progression in neurodegenerative syndromes.,Brain,,2025,Journal Article,eng,0,Ramanan S;Akarca D;Henderson SK;Rouse MA;Allinson K;Patterson K;Rowe JB;Lambon Ralph MA,,,,,,,,148,2,448-466,,"S Ramanan, 2025, Brain" +PUBMED,,,38976293,Does high hepatic bioavailability enhance the effect of oral compared to subcutaneous glucocorticoids?,Clin Exp Rheumatol,,2024,Journal Article,eng,0,van Geel EH;Boers M;Hartman L;Smulders YM,,,,,,,,42,11,2265-2267,,"EH van Geel, 2024, Clin Exp Rheumatol" +PUBMED,,,38969158,Periodontitis is an immune-related adverse event associated with immune checkpoint inhibitors: A multi-center cohort study.,Cancer Lett,,2024,Journal Article,eng,0,Ma KS;Chiang CH;Chen ST;Dinh Y;Chiang CH;Van Dyke TE;Sullivan R;Ananthakrishnan AN;Hsia YP;Peng CM;Chiang CH,,,,,,,,598,,217100,,"KS Ma, 2024, Cancer Lett" +PUBMED,,,38777744,Gradient boosted regression as a tool to reveal key drivers of temporal dynamics in a synthetic yeast community.,FEMS Microbiol Ecol,,2024,Journal Article,eng,0,Conacher CG;Watson BW;Bauer FF,,,,,,,,100,7,,,"CG Conacher, 2024, FEMS Microbiol Ecol" +PUBMED,,,38764066,Effect of emphysema on AI software and human reader performance in lung nodule detection from low-dose chest CT.,Eur Radiol Exp,,2024,Journal Article,eng,0,Sourlos N;Pelgrim G;Wisselink HJ;Yang X;de Jonge G;Rook M;Prokop M;Sidorenkov G;van Tuinen M;Vliegenthart R;van Ooijen PMA,,,,,,,,8,1,63,,"N Sourlos, 2024, Eur Radiol Exp" +PUBMED,,,38672974,"Multidimensional Impact of Dupilumab on Chronic Rhinosinusitis with Nasal Polyps: A Complete Health Technology Assessment of Clinical, Economic, and Non-Clinical Domains.",J Pers Med,,2024,Journal Article,eng,0,La Mantia I;Ottaviano G;Ragusa M;Trimarchi M;Foglia E;Schettini F;Bellavia D;Cantone E,,,,,,,,14,4,,,"I La Mantia, 2024, J Pers Med" +PUBMED,,,38671166,Thromboprophylaxis for outpatients with COVID-19: a Systematic Review and Meta-analysis.,J Thromb Thrombolysis,,2024,Systematic Review,eng,0,Chiang CH;Ahmed O;Liu W;See XY;Chang YC;Peng CY;Wang Z;Chiang CH;Hsia YP;Chiang CH,,,,,,,,57,5,784-787,,"CH Chiang, 2024, J Thromb Thrombolysis" +PUBMED,,,38564640,"OASIS: An interpretable, finite-sample valid alternative to Pearson's X(2) for scientific discovery.",Proc Natl Acad Sci U S A,,2024,Journal Article,eng,0,Baharav TZ;Tse D;Salzman J,,,,,,,,121,15,e2304671121,,"TZ Baharav, 2024, Proc Natl Acad Sci U S A" +PUBMED,,,38422606,Complex interventions in frail older adults.,Arch Gerontol Geriatr,,2024,Letter,eng,0,Van der Elst MCJ;Schoenmakers B;Schols JMGA;De Witte N;De Lepeleire J,,,,,,,,122,,105372,,"MCJ Van der Elst, 2024, Arch Gerontol Geriatr" +PUBMED,,,38347856,Leisure time physical activity is associated with improved diastolic heart function and is partly mediated by unsupervised quantified metabolic health.,BMJ Open Sport Exerc Med,,2024,Journal Article,eng,0,Klarenberg H;van der Velde JH;Peeters CF;Dekkers IA;de Mutsert R;Jukema JW;Rosendaal FR;Leiner T;Froeling M;Jorstad H;Boekholdt SM;Strijkers GJ;Lamb HJ,,,,,,,,10,1,e001778,,"H Klarenberg, 2024, BMJ Open Sport Exerc Med" +PUBMED,,,38318999,Barbed sutures and skin adhesives improve wound closure in hip and knee arthroplasty.,Knee Surg Sports Traumatol Arthrosc,,2024,Journal Article,eng,0,Romanini E;Zanoli GA;Ascione T;Balato G;Baldini A;Foglia E;Pellegrini AV;Verde F;Zaffagnini S,,,,,,,,32,2,303-310,,"E Romanini, 2024, Knee Surg Sports Traumatol Arthrosc" +PUBMED,,,38301750,CLARUS: An interactive explainable AI platform for manual counterfactuals in graph neural networks.,J Biomed Inform,,2024,Journal Article,eng,0,Metsch JM;Saranti A;Angerschmid A;Pfeifer B;Klemt V;Holzinger A;Hauschild AC,,,,,,,,150,,104600,,"JM Metsch, 2024, J Biomed Inform" +PUBMED,,,38287951,SpectroFood dataset: A comprehensive fruit and vegetable hyperspectral meta-dataset for dry matter estimation.,Data Brief,,2024,Journal Article,eng,0,Malounas I;Vierbergen W;Kutluk S;Zude-Sasse M;Yang K;Zhao M;Argyropoulos D;Van Beek J;Ampe E;Fountas S,,,,,,,,52,,110040,,"I Malounas, 2024, Data Brief" +PUBMED,,,38262670,Impact of different corticosteroids on severe community-acquired pneumonia: a systematic review and meta-analysis.,BMJ Open Respir Res,,2024,Meta-Analysis,eng,0,See XY;Wang TH;Chang YC;Lo J;Liu W;Choo CYW;Lee YC;Ma KSK;Chiang CH;Hsia YP;Chiang CH;Chiang CH,,,,,,,,11,1,,,"XY See, 2024, BMJ Open Respir Res" +PUBMED,,,38104536,The Effect of Exercise on Cardiotoxicity in Women with Breast Cancer Receiving Anthracycline-Based Chemotherapy: A Systematic Review and Meta-Analysis.,Oncology,,2024,Systematic Review,eng,0,Chiang CH;Chang YC;Haw Y;Tan JY;Chiang CH;Hsia YP;Chiang CH,,,,,,,,102,6,510-514,,"CH Chiang, 2024, Oncology" +PUBMED,,,38081880,A CNN based m5c RNA methylation predictor.,Sci Rep,,2023,Journal Article,eng,0,Aslam I;Shah S;Jabeen S;ELAffendi M;A Abdel Latif A;Ul Haq N;Ali G,,,,,,,,13,1,21885,,"I Aslam, 2023, Sci Rep" +PUBMED,,,38053104,21st century (clinical) decision support in nursing and allied healthcare. Developing a learning health system: a reasoned design of a theoretical framework.,BMC Med Inform Decis Mak,,2023,Journal Article,eng,0,van Velzen M;de Graaf-Waar HI;Ubert T;van der Willigen RF;Muilwijk L;Schmitt MA;Scheper MC;van Meeteren NLU,,,,,,,,23,1,279,,"M van Velzen, 2023, BMC Med Inform Decis Mak" +PUBMED,,,38026834,Multiple imputation of missing data in multilevel ecological momentary assessments: an example using smoking cessation study data.,Front Digit Health,,2023,Journal Article,eng,0,Ji L;Li Y;Potter LN;Lam CY;Nahum-Shani I;Wetter DW;Chow SM,,,,,,,,5,,1099517,,"L Ji, 2023, Front Digit Health" +PUBMED,,,37961606,"OASIS: An interpretable, finite-sample valid alternative to Pearson's X2 for scientific discovery.",bioRxiv,,2023,Preprint,eng,0,Baharav TZ;Tse D;Salzman J,,,,,,,,,,,,"TZ Baharav, 2023, bioRxiv" +PUBMED,,,37926930,Optimal dose for the efficacy of asenapine in patients with schizophrenia: Real-world data.,Neuropsychopharmacol Rep,,2024,Meta-Analysis,eng,0,Takekita Y;Hiraoka S;Iwama Y;Matsui D;Aoki N;Ogata H;Funatsuki T;Shimizu T;Murase Y;Koshikawa Y;Kato M;Kinoshita T,,,,,,,,44,1,234-239,,"Y Takekita, 2024, Neuropsychopharmacol Rep" +PUBMED,,,37915525,What does it mean to be an agent?,Front Psychol,,2023,Journal Article,eng,0,Naidoo M,,,,,,,,14,,1273470,,"M Naidoo, 2023, Front Psychol" +PUBMED,,,37907469,Author Correction: Non-pharmaceutical interventions to combat COVID-19 in the Americas described through daily sub-national data.,Sci Data,,2023,Published Erratum,eng,0,Touchton M;Knaul FM;Arreola-Ornelas H;Porteny T;Carniado ÓM;Faganello M;Hummel C;Otero S;Insua J;Patino F;Undurraga E;PĂ©rez-Cruz P;Sanchez-Talanquer M;Velasco Guachalla VX;Nelson-Nuñez J;Boulding C;Calderon-Anyosa R;Garcia PJ;Vargas Enciso V,,,,,,,,10,1,751,,"M Touchton, 2023, Sci Data" +PUBMED,,,37899087,Are maternal vaccines effective and safe for mothers and infants? A systematic review and meta-analysis of randomised controlled trials.,BMJ Glob Health,,2023,Meta-Analysis,eng,0,de Bruin O;Phijffer E;Ahmadizar F;van der Maas N;Wildenbeest J;Sturkenboom M;Bont L;Bloemenkamp K,,,,,,,,8,10,,,"O de Bruin, 2023, BMJ Glob Health" +PUBMED,,,37866733,Ketamine versus electroconvulsive therapy for major depressive disorder: A deeper dive into the data.,J Affect Disord,,2024,Letter,eng,0,Wang Z;Chiang CH;Hsia YP;Chiang CH;Chiang CH,,,,,,,,345,,120-121,,"Z Wang, 2024, J Affect Disord" +PUBMED,,,37865630,Non-pharmaceutical interventions to combat COVID-19 in the Americas described through daily sub-national data.,Sci Data,,2023,Dataset,eng,0,Touchton M;Knaul FM;Arreola-Ornelas H;Porteny T;Carniado ÓM;Faganello M;Hummel C;Otero S;Insua J;Patino F;Undurraga E;PĂ©rez-Cruz P;Sanchez-Talanquer M;Velasco Guachalla VX;Nelson-Nuñez J;Boulding C;Calderon-Anyosa R;Garcia PJ;Vargas Enciso V,,,,,,,,10,1,734,,"M Touchton, 2023, Sci Data" +PUBMED,,,37853448,"Healthcare professional and manager perceptions on drivers, benefits, and challenges of telemedicine: results from a cross-sectional survey in the Italian NHS.",BMC Health Serv Res,,2023,Journal Article,eng,0,Antonacci G;Benevento E;Bonavitacola S;Cannavacciuolo L;Foglia E;Fusi G;Garagiola E;Ponsiglione C;Stefanini A,,,,,,,,23,1,1115,,"G Antonacci, 2023, BMC Health Serv Res" +PUBMED,,,37792852,Characterization of 3D organotypic epithelial tissues reveals tonsil-specific differences in tonic interferon signaling.,PLoS One,,2023,Journal Article,eng,0,Jackson R;Rajadhyaksha EV;Loeffler RS;Flores CE;Van Doorslaer K,,,,,,,,18,10,e0292368,,"R Jackson, 2023, PLoS One" +PUBMED,,,37787143,Influence of nanopore coating patterns on the translocation dynamics of polyelectrolytes.,J Chem Phys,,2023,Journal Article,eng,0,Datar A;Tanyhin B;Melchionna S;Fyta M,,,,,,,,159,13,,,"A Datar, 2023, J Chem Phys" +PUBMED,,,37778607,Adopting a child perspective for exposome research on mental health and cognitive development - Conceptualisation and opportunities.,Environ Res,,2023,Journal Article,eng,0,Persson Waye K;Löve J;Lercher P;Dzhambov AM;Klatte M;Schreckenberg D;Belke C;Leist L;Ristovska G;Jeram S;Kanninen KM;Selander J;Arat A;Lachmann T;Clark C;Botteldooren D;White K;Julvez J;Foraster M;Kaprio J;Bolte G;Psyllidis A;Gulliver J;Boshuizen H;Bozzon A;Fels J;Hornikx M;van den Hazel P;Weber M;Brambilla M;Braat-Eggen E;Van Kamp I;Vincens N;Equal-life Scientific Team,,,,,,,,239,Pt 1,117279,,"K Persson Waye, 2023, Environ Res" +PUBMED,,,37769849,Challenges of artificial intelligence in precision oncology: public-private partnerships including national health agencies as an asset to make it happen.,Ann Oncol,,2024,Editorial,eng,0,Luu VP;Fiorini M;Combes S;Quemeneur E;Bonneville M;Bousquet PJ,,,,,,,,35,2,154-158,,"VP Luu, 2024, Ann Oncol" +PUBMED,,,37644607,Effects of corticosteroids on severe community-acquired pneumonia: a closer look at the evidence.,Crit Care,,2023,Letter,eng,0,Chiang CH;See XY;Wang TH;Chang YC;Lo JE;Liu WT;Choo CYW;Chiang CH;Hsia YP;Chiang CH,,,,,,,,27,1,336,,"CH Chiang, 2023, Crit Care" +PUBMED,,,37360931,Data-driven customer acceptance for attended home delivery.,OR Spectr,,2023,Journal Article,eng,0,Köhler C;Campbell AM;Ehmke JF,,,,,,,,,,1-36,,"C Köhler, 2023, OR Spectr" +PUBMED,,,37333015,Brief Report: A Multidisciplinary Initial Workup for Suspected Lung Cancer as Fast-Track Intervention to Histopathologic Diagnosis.,JTO Clin Res Rep,,2023,Journal Article,eng,0,Pujol JL;Mercier G;Vasile M;Serre I;Vernhet-Kovacsik H;Bommart S,,,,,,,,4,6,100526,,"JL Pujol, 2023, JTO Clin Res Rep" +PUBMED,,,37327210,Predicted versus CT-derived total lung volume in a general population: The ImaLife study.,PLoS One,,2023,Journal Article,eng,0,Wisselink HJ;Steerenberg DJD;Rook M;Pelgrim GJ;Heuvelmans MA;van den Berge M;de Bock GH;Vliegenthart R,,,,,,,,18,6,e0287383,,"HJ Wisselink, 2023, PLoS One" +PUBMED,,,37277211,Characterisation of patients with axial psoriatic arthritis and patients with axial spondyloarthritis and concomitant psoriasis in the SCQM registry.,RMD Open,,2023,Journal Article,eng,0,Ciurea A;Götschi A;Kissling S;Bernatschek A;BĂŒrki K;Exer P;Nissen MJ;Möller B;Scherer A;Micheroli R,,,,,,,,9,2,,,"A Ciurea, 2023, RMD Open" +PUBMED,,,37250091,System Integrated Digital Empowering and teleRehabilitation to promote patient Activation and well-Being in chronic disabilities: A usability and acceptability study.,Front Public Health,,2023,Journal Article,eng,0,Rossetto F;Borgnis F;Isernia S;Foglia E;Garagiola E;Realdon O;Baglio F,,,,,,,,11,,1154481,,"F Rossetto, 2023, Front Public Health" +PUBMED,,,37243964,"Authors' Reply to Juergen O Kirchner's Comment on ""Incidence Rates of Autoimmune Diseases in European Healthcare Databases: A Contribution of the ADVANCE Project"".",Drug Saf,,2023,Letter,eng,0,Willame C;Weibel D;Sturkenboom MCJM,,,,,,,,46,8,813-815,,"C Willame, 2023, Drug Saf" +PUBMED,,,37231860,Toward a Flexible Use of Frailty Measurements in Older People.,Gerontology,,2023,Letter,eng,0,Van der Elst MCJ;Schoenmakers B;Schols JMGA;De Witte N;De Lepeleire J;D-SCOPE Consortium,,,,,,,,69,9,1113-1114,,"MCJ Van der Elst, 2023, Gerontology" +PUBMED,,,37198019,Assessing vaccine safety during a pandemic: Recent experience and lessons learned for the future.,Vaccine,,2023,Review,eng,0,Black SB;Chandler RE;Edwards KM;Sturkenboom MCJM,,,,,,,,41,25,3790-3795,,"SB Black, 2023, Vaccine" diff --git a/test_etl.py b/test_etl.py new file mode 100644 index 000000000..5dbe438b7 --- /dev/null +++ b/test_etl.py @@ -0,0 +1,85 @@ +import sys +import os +import pandas as pd + +# Add the project root to sys.path +sys.path.append(os.path.abspath(os.path.dirname(__file__))) + +from www.services.etl import ETLPipeline +from functions.get_annualproduction import get_annual_production +from functions.get_relevantauthors import get_relevant_authors +from functions.get_frequentwords import get_frequent_words +from functions.get_wordcloud import get_wordcloud +from functions.get_averagecitations import get_average_citations + +class MockReactive: + def __init__(self, df): + self.df = df + def get(self): + return self.df + def set(self, value): + self.df = value + +def test_api_extraction(): + print("=== Testing ETL Pipeline with OpenAlex API ===") + try: + # Extract and standardize + query = "machine learning bibliometrics" + print(f"Querying OpenAlex for: '{query}'...") + df = ETLPipeline.convert2df(source_data="API", source_type="OpenAlex", is_api=True, query=query) + print(f"Successfully extracted and standardized {len(df)} records.") + print("Columns:", df.columns.tolist()) + print("Sample of SR column:", df['SR'].head(3).tolist()) + + # Wrap the DataFrame to simulate Shiny's reactive.Value.get() + reactive_df = MockReactive(df) + + # Test analytical functions + print("\n--- Testing Analytical Functions ---") + + # 1. Annual Production + try: + print("1. get_annual_production...") + res = get_annual_production(reactive_df) + print("Success!") + except Exception as e: + print(f"Failed: {e}") + + # 2. Relevant Authors + try: + print("2. get_relevant_authors...") + res = get_relevant_authors(reactive_df, num_of_authors=10) + print("Success!") + except Exception as e: + print(f"Failed: {e}") + + # 3. Frequent Words + try: + print("3. get_frequent_words...") + # We need to simulate the parameters the function expects + res = get_frequent_words(reactive_df, ngram=1, num_of_words=10, word_type="TI", file_upload_terms=None, file_upload_synonyms=None) + print("Success!") + except Exception as e: + print(f"Failed: {e}") + + # 4. WordCloud + try: + print("4. get_wordcloud...") + res = get_wordcloud(reactive_df, ngram=1, num_of_words_wc=10, field_wc="TI", file_upload_terms_wc=None, file_upload_synonyms_wc=None) + print("Success!") + except Exception as e: + print(f"Failed: {e}") + + # 5. Average Citations + try: + print("5. get_average_citations...") + res = get_average_citations(reactive_df) + print("Success!") + except Exception as e: + print(f"Failed: {e}") + + except Exception as e: + print(f"Pipeline execution failed: {e}") + +if __name__ == "__main__": + test_api_extraction() diff --git a/test_histnetwork.py b/test_histnetwork.py new file mode 100644 index 000000000..11ccf6a62 --- /dev/null +++ b/test_histnetwork.py @@ -0,0 +1,35 @@ +import sys +sys.path.append('.') +from www.services.etl import ETLPipeline +from www.services.histnetwork import histNetwork +import pandas as pd +from shiny import reactive + +def test_histnetwork(): + # 1. Fetch data from OpenAlex API + query = "machine learning" + print(f"Fetching data for query: {query}") + df = ETLPipeline.convert2df(source_data="API", source_type="OpenAlex", is_api=True, query=query) + + print("\nStarting histNetwork test...") + # histNetwork requires a reactive.Value according to standard implementation, + # but the function itself calls df.get(). + # Let's wrap it in a mock object with a .get() method if necessary, + # but looking at histNetwork: M = df.get() if hasattr(df, 'get') else df + class MockReactive: + def __init__(self, val): + self.val = val + def get(self): + return self.val + + rv_df = MockReactive(df) + results = histNetwork(rv_df, network=True) + + if results is not None: + print("\nhistNetwork executed successfully!") + print(f"NetMatrix shape: {results['NetMatrix'].shape}") + else: + print("\nhistNetwork failed.") + +if __name__ == '__main__': + test_histnetwork() diff --git a/test_perf.py b/test_perf.py new file mode 100644 index 000000000..8f235c5ef --- /dev/null +++ b/test_perf.py @@ -0,0 +1,28 @@ +import time +import sys +sys.path.append('.') +from www.services.etl import ETLPipeline +from www.services.histnetwork import histNetwork + +def test_perf(): + print("Fetching...") + t0 = time.time() + df = ETLPipeline.convert2df(source_data='API', source_type='OpenAlex', is_api=True, query='machine learning') + t1 = time.time() + print(f"Fetched {len(df)} rows in {t1-t0:.2f}s") + + class MockReactive: + def __init__(self, val): + self.val = val + def get(self): + return self.val + + rv_df = MockReactive(df) + print("Running histNetwork...") + t2 = time.time() + res = histNetwork(rv_df, network=True) + t3 = time.time() + print(f"histNetwork completed in {t3-t2:.2f}s") + +if __name__ == '__main__': + test_perf() diff --git a/www/services/__init__.py b/www/services/__init__.py index 28584e105..ede5ff3d4 100644 --- a/www/services/__init__.py +++ b/www/services/__init__.py @@ -14,4 +14,8 @@ from .tabletag import * from .termextraction import * from .thematicmap import * -from .utils import * \ No newline at end of file +from .utils import * +from .api_retriever import * +from .standardizer import * +from .validator import * +from .etl import * \ No newline at end of file diff --git a/www/services/api_retriever.py b/www/services/api_retriever.py new file mode 100644 index 000000000..b305b04bc --- /dev/null +++ b/www/services/api_retriever.py @@ -0,0 +1,125 @@ +import requests +import time +import math +from typing import List, Dict, Any + +class APIRetriever: + """ + Handles data extraction from bibliographic APIs such as OpenAlex and PubMed. + + This class abstracts the raw HTTP requests, pagination, and rate limiting + associated with fetching metadata from external sources. + """ + + @staticmethod + def get_openalex(query: str, max_results: int = 100) -> List[Dict[str, Any]]: + """ + Fetches bibliographic records from the OpenAlex API based on a search query. + Handles API pagination and basic rate limiting to politely extract data. + + Args: + query (str): The keyword search string to query OpenAlex works. + max_results (int, optional): The maximum number of records to retrieve. Defaults to 100. + + Returns: + List[Dict[str, Any]]: A list of dictionaries, where each dictionary represents + a single bibliographic record retrieved from OpenAlex. + """ + base_url = "https://api.openalex.org/works" + results = [] + per_page = min(max_results, 50) + + try: + # Initial request to get total count + params = { + "search": query, + "per-page": per_page, + "page": 1, + "mailto": "test@example.com" # Polite pool + } + response = requests.get(base_url, params=params, timeout=10) + response.raise_for_status() + data = response.json() + + meta = data.get("meta", {}) + total_count = meta.get("count", 0) + if total_count == 0: + return results + + results.extend(data.get("results", [])) + + # Fetch remaining pages if needed + pages_needed = math.ceil(min(total_count, max_results) / per_page) + for page in range(2, pages_needed + 1): + time.sleep(0.1) # Rate limit respect + params["page"] = page + response = requests.get(base_url, params=params, timeout=10) + if response.status_code == 200: + data = response.json() + results.extend(data.get("results", [])) + + # Limit exactly to max_results + return results[:max_results] + except Exception as e: + print(f"Error retrieving from OpenAlex: {e}") + return results + + @staticmethod + def get_pubmed(query: str, max_results: int = 100) -> List[Dict[str, Any]]: + """ + Fetches bibliographic records from the PubMed API using NCBI E-utilities. + This is a two-step process: first fetching PMIDs via esearch, then fetching + document summaries via esummary, adhering to the NIH limit of 3 requests per second. + + Args: + query (str): The keyword search string to query PubMed. + max_results (int, optional): The maximum number of records to retrieve. Defaults to 100. + + Returns: + List[Dict[str, Any]]: A list of dictionaries representing PubMed summaries + mapped directly from the JSON response. + """ + search_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" + summary_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi" + results = [] + + try: + # Step 1: Get PMIDs + search_params = { + "db": "pubmed", + "term": query, + "retmax": max_results, + "retmode": "json" + } + search_response = requests.get(search_url, params=search_params, timeout=10) + search_response.raise_for_status() + search_data = search_response.json() + id_list = search_data.get("esearchresult", {}).get("idlist", []) + + if not id_list: + return results + + # Step 2: Get summaries for these PMIDs + # PubMed limits to 200-300 ids per GET request, we'll chunk by 100 + chunk_size = 100 + for i in range(0, len(id_list), chunk_size): + chunk = id_list[i:i + chunk_size] + time.sleep(0.34) # NIH allows 3 requests per second + summary_params = { + "db": "pubmed", + "id": ",".join(chunk), + "retmode": "json" + } + sum_response = requests.get(summary_url, params=summary_params, timeout=10) + if sum_response.status_code == 200: + sum_data = sum_response.json() + result_dict = sum_data.get("result", {}) + # uids are stored in result["uids"], actual data in result[uid] + for uid in result_dict.get("uids", []): + if uid in result_dict: + results.append(result_dict[uid]) + + return results + except Exception as e: + print(f"Error retrieving from PubMed: {e}") + return results diff --git a/www/services/etl.py b/www/services/etl.py new file mode 100644 index 000000000..ed18cfd0a --- /dev/null +++ b/www/services/etl.py @@ -0,0 +1,128 @@ +import pandas as pd +from typing import Union, List, Dict, Any +from .api_retriever import APIRetriever +from .standardizer import Standardizer +from .validator import Validator +from .format_functions import format_sr_column + +class ETLPipeline: + """ + Main ETL Pipeline dispatcher for Bibliometrix. + + This class acts as a central Dispatcher. It evaluates the type and origin of + the input source data, routes it to the appropriate Extractor (API or file parser), + passes it through the Standardizer to enforce the schema, calculates secondary + tags (like SR), and finally validates the output. + """ + + @classmethod + def convert2df(cls, source_data: Union[str, pd.DataFrame, List[Dict[str, Any]]], + source_type: str, + is_api: bool = False, + query: str = "", + original_filename: str = "") -> pd.DataFrame: + """ + The main dispatcher function executing the Extract -> Transform -> Validate -> Load pipeline. + + Args: + source_data (Union[str, pd.DataFrame, List[Dict[str, Any]]]): The raw data source. + Can be a filepath, a raw DataFrame, or a list of dictionaries. + source_type (str): The origin of the data (e.g., "Scopus", "Dimensions", "PubMed", "OpenAlex"). + is_api (bool, optional): Flag indicating whether extraction should occur via live API query. Defaults to False. + query (str, optional): The API search query, required if is_api is True. Defaults to "". + original_filename (str, optional): Preserved filename used to infer data formats for manual uploads. Defaults to "". + + Returns: + pd.DataFrame: A fully standardized, validated Bibliometrix-compatible DataFrame. + + Raises: + ValueError: If the source type, file format, or API is unsupported. + """ + # Phase 1: EXTRACT + raw_data = None + if is_api: + if source_type.upper() == "OPENALEX": + raw_data = APIRetriever.get_openalex(query) + elif source_type.upper() == "PUBMED": + raw_data = APIRetriever.get_pubmed(query) + else: + raise ValueError(f"API extraction not supported for {source_type}") + else: + if isinstance(source_data, str): + # Use original_filename if provided, otherwise fallback to source_data path + file_to_check = original_filename if original_filename else source_data + + # Manual download parsing + if source_type.upper() == "SCOPUS" and file_to_check.lower().endswith('.csv'): + raw_data = pd.read_csv(source_data) + elif source_type.upper() == "DIMENSIONS" and (file_to_check.lower().endswith('.xlsx') or file_to_check.lower().endswith('.csv')): + if file_to_check.lower().endswith('.xlsx'): + raw_data = pd.read_excel(source_data, skiprows=1) + else: + raw_data = pd.read_csv(source_data, skiprows=1) + elif source_type.upper() == "PUBMED" and file_to_check.lower().endswith('.txt'): + from .parsers import parse_pubmed_data + raw_data = parse_pubmed_data(source_data) + elif source_type.upper() == "WOS": + from .parsers import parse_wos_data + raw_data = parse_wos_data(source_data) + else: + raise ValueError(f"Unsupported manual file format for {source_type} (file: {file_to_check})") + elif isinstance(source_data, pd.DataFrame): + raw_data = source_data + elif isinstance(source_data, list): + raw_data = source_data + else: + raise ValueError("Invalid source_data format") + + if len(raw_data) == 0: + raise ValueError("No data extracted.") + + # Phase 2: TRANSFORM + standardized_df = Standardizer.apply_mapping_and_types(raw_data, source_type) + + # Phase 3 & 4: CALCULATED FIELDS (SR) + # We need to apply format_sr_column. + # format_sr_column expects the entry in the specific database format. + # But we already standardized. To reuse format_sr_column, we must pass it + # simulating WoS format or the original source format. + # The easiest way is to use the original raw data row to generate SR if possible, + # or simulate a WoS entry since our dataframe is now in WoS standard schema. + + sr_list = [] + for i in range(len(standardized_df)): + row = standardized_df.iloc[i] + # Create a mock WoS entry for format_sr_column + # format_sr_column for Web_of_Science .txt expects: + # AU: list of strings (first author comma separated) + # PY: string (it takes [0], so we provide a list or string) + # SO: list of strings or a single string + + mock_entry = {} + if len(row["AU"]) > 0: + # Ensure author is comma separated (Surname, Initials) + author = row["AU"][0] + if "," not in author: + parts = author.split() + if len(parts) > 1: + author = f"{parts[-1]}, {' '.join(parts[:-1])}" + mock_entry["AU"] = [author] + else: + mock_entry["AU"] = ["Unknown, U."] + + mock_entry["PY"] = [str(row["PY"])] + mock_entry["SO"] = [str(row["SO"])] + + try: + sr = format_sr_column(mock_entry, 'Web_of_Science', '.txt') + except Exception as e: + sr = "Unknown, 0000, Unknown" + + sr_list.append(sr) + + standardized_df["SR"] = sr_list + + # Phase 5: VALIDATION + Validator.validate(standardized_df) + + return standardized_df diff --git a/www/services/histnetwork.py b/www/services/histnetwork.py index 7848d9744..10fc2c70e 100644 --- a/www/services/histnetwork.py +++ b/www/services/histnetwork.py @@ -34,7 +34,7 @@ def histNetwork(df, min_citations=0, sep=";", network=True): # Fill missing values in TC M['TC'] = M['TC'].fillna(0) - if db == "Web_of_Science": + if db in ["Web_of_Science", "OPENALEX", "PUBMED"]: results = wos(M, min_citations=min_citations, sep=sep, network=network) elif db == "Scopus": results = scopus(M, min_citations=min_citations, sep=sep, network=network) @@ -47,8 +47,6 @@ def histNetwork(df, min_citations=0, sep=";", network=True): def wos(M, min_citations, sep, network): - print("\nWOS DB:\nSearching local citations (LCS) by reference items (SR) and DOIs...\n") - # Sort data by publication year M = M.sort_values(by="PY").reset_index(drop=True) @@ -56,39 +54,70 @@ def wos(M, min_citations, sep, network): M['Paper'] = np.arange(0, len(M)) M['nLABEL'] = np.arange(0, len(M)) - # Process cited references (CR) - CR = [] - for i, refs in enumerate(M['CR']): - for ref in refs: - # Extract DOI - doi = "" - if 'DOI' in ref: - parts = ref.split('DOI', 1) - doi = parts[1].strip() if len(parts) > 1 else "" - # Extract AU, PY, SO - ref_parts = ref.split(',') - au = ref_parts[0].replace('.', ' ').strip() if len(ref_parts) > 0 else "" - py = ref_parts[1].strip() if len(ref_parts) > 1 else "" - so = ref_parts[2].strip() if len(ref_parts) > 2 else "" - sr = f"{au}, {py}, {so}" - CR.append({'ref': ref, 'Paper': i, 'DI': doi, 'AU': au, 'PY': py, 'SO': so, 'SR': sr}) - - print(f"\nAnalyzing {len(CR)} reference items...\n") - - CR_df = pd.DataFrame(CR) - - # Add LABEL field to M and CR - M['LABEL'] = M['SR_FULL'].fillna('').str.upper() + " DOI " + M['DI'].fillna('').str.upper() - M['LABEL'] = M['LABEL'].str.strip() - CR_df['LABEL'] = CR_df['SR'].fillna('').str.upper() + " DOI " + CR_df['DI'].fillna('').str.upper() - CR_df['LABEL'] = CR_df['LABEL'].str.strip() - - # Match references with papers (left join as in R) - L = pd.merge(M, CR_df, on='LABEL', how='left', suffixes=('_M', '_CR')) - L = L[L['Paper_CR'].notnull()] - L['CITING'] = M.loc[L['Paper_CR'], 'LABEL'].values - L['nCITING'] = M.loc[L['Paper_CR'], 'nLABEL'].values - L['CIT_PY'] = M.loc[L['Paper_CR'], 'PY'].values + if M['DB'].iloc[0] == "OPENALEX": + print("\nOPENALEX DB:\nSearching local citations using OpenAlex IDs (UT) and referenced works (CR)...\n") + sr_col = 'SR_FULL' if 'SR_FULL' in M.columns else 'SR' + M['LABEL'] = M[sr_col].fillna('').str.upper() + " DOI " + M['DI'].fillna('').str.upper() + M['LABEL'] = M['LABEL'].str.strip() + # Explode CR to get one row per reference + CR_df = M[['UT', 'CR', 'Paper', 'nLABEL', 'PY']].explode('CR').dropna(subset=['CR']).rename(columns={'CR': 'ref', 'Paper': 'Paper_CR', 'nLABEL': 'nCITING', 'PY': 'CIT_PY'}) + # Clean UT to match refs + M['UT_CLEAN'] = M['UT'].str.upper().str.strip() + CR_df['ref'] = CR_df['ref'].str.upper().str.strip() + + # Match references with papers (L contains all matches) + L = pd.merge(M, CR_df, left_on='UT_CLEAN', right_on='ref', how='right') + L = L[L['Paper'].notnull()] + + print("\nAfter filtering:") + print(L.shape) + + # Display the HTTPS link mapping just like the friend's output + print("\nCitation Mapping (Citing Paper -> Cited Reference):") + print(L[['UT_x', 'UT_CLEAN', 'Paper', 'Paper_CR']].head(20)) + + print(type(L)) + print(L.columns) + + L['CITING'] = M.loc[L['Paper_CR'], 'LABEL'].values + + else: + print("\nWOS/PUBMED DB:\nSearching local citations (LCS) by reference items (SR) and DOIs...\n") + # Process cited references (CR) + CR = [] + for i, refs in enumerate(M['CR']): + for ref in refs: + # Extract DOI + doi = "" + if 'DOI' in ref: + parts = ref.split('DOI', 1) + doi = parts[1].strip() if len(parts) > 1 else "" + # Extract AU, PY, SO + ref_parts = ref.split(',') + au = ref_parts[0].replace('.', ' ').strip() if len(ref_parts) > 0 else "" + py = ref_parts[1].strip() if len(ref_parts) > 1 else "" + so = ref_parts[2].strip() if len(ref_parts) > 2 else "" + sr = f"{au}, {py}, {so}" + CR.append({'ref': ref, 'Paper': i, 'DI': doi, 'AU': au, 'PY': py, 'SO': so, 'SR': sr}) + + print(f"\nAnalyzing {len(CR)} reference items...\n") + + CR_df = pd.DataFrame(CR) + + # Add LABEL field to M and CR + sr_col = 'SR_FULL' if 'SR_FULL' in M.columns else 'SR' + M['LABEL'] = M[sr_col].fillna('').str.upper() + " DOI " + M['DI'].fillna('').str.upper() + M['LABEL'] = M['LABEL'].str.strip() + if not CR_df.empty: + CR_df['LABEL'] = CR_df['SR'].fillna('').str.upper() + " DOI " + CR_df['DI'].fillna('').str.upper() + CR_df['LABEL'] = CR_df['LABEL'].str.strip() + + # Match references with papers (left join as in R) + L = pd.merge(M, CR_df, on='LABEL', how='left', suffixes=('_M', '_CR')) + L = L[L['Paper_CR'].notnull()] + L['CITING'] = M.loc[L['Paper_CR'], 'LABEL'].values + L['nCITING'] = M.loc[L['Paper_CR'], 'nLABEL'].values + L['CIT_PY'] = M.loc[L['Paper_CR'], 'PY'].values # Compute Local Citation Scores (LCS) LCS = L.groupby('nLABEL').size().reset_index(name='LCS') @@ -115,15 +144,8 @@ def wos(M, min_citations, sep, network): M.at[paper_idx, 'LCR'] = row['LCR'] # Assign unique names to duplicated LABELs - st = False - i = 0 - while not st: - ind = M['LABEL'].duplicated(keep=False) - if ind.any(): - i += 1 - M.loc[ind, 'LABEL'] = M.loc[ind, 'LABEL'] + f"-{chr(96 + i)}" - else: - st = True + counts = M.groupby('LABEL').cumcount() + M['LABEL'] = M['LABEL'] + counts.apply(lambda x: f"-{chr(96 + x)}" if x > 0 else "") M.index = M['LABEL'].str.strip() M['LCR'] = M['LCR'].fillna('') diff --git a/www/services/standardizer.py b/www/services/standardizer.py new file mode 100644 index 000000000..125b6a2b4 --- /dev/null +++ b/www/services/standardizer.py @@ -0,0 +1,177 @@ +import pandas as pd +from typing import List, Dict, Any, Union + +class Standardizer: + """ + Standardizes bibliographic data from heterogeneous sources to the Web of Science (WoS) schema. + + This class implements the Lookup Strategy pattern via a predefined MAPPING dictionary + to translate proprietary column names (e.g., OpenAlex's 'referenced_works') into + standardized WoS tags (e.g., 'CR'). It also enforces Type Contracts, ensuring + that all output data adheres to expected types (e.g., lists of strings for authors, + integers for years) and gracefully handles null values. + """ + + # WoS Standard Schema + STANDARD_COLUMNS = [ + "DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC", + "AU", "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR" + ] + + LIST_COLUMNS = ["AU", "AF", "C1", "CR", "DE", "ID"] + INTEGER_COLUMNS = ["TC", "PY"] + + # Mapping Dictionary (Lookup Strategy) + # Maps proprietary column names to WoS tags. + # Extend this as needed for Scopus, Dimensions, etc. + MAPPING = { + # OpenAlex Mapping + "id": "UT", + "doi": "DI", + "title": "TI", + "publication_year": "PY", + "type": "DT", + "language": "LA", + "cited_by_count": "TC", + "referenced_works": "CR", + # PubMed Mapping + "uid": "PMID", + # "title" is already mapped above to "TI" + "source": "SO", + "pubdate": "PY", + "pubtype": "DT", + "lang": "LA", + "pmc": "UT", + "volume": "VL", + "issue": "IS", + "pages": "BP", + # Scopus Mapping + "Authors": "AU", + "Author(s) ID": "AF", + "Title": "TI", + "Year": "PY", + "Source title": "SO", + "Volume": "VL", + "Issue": "IS", + "Page start": "BP", + "Page end": "EP", + "Cited by": "TC", + "DOI": "DI", + "Document Type": "DT", + "Source": "DB", + "Affiliations": "C1", + "Author Keywords": "DE", + "Index Keywords": "ID", + "Abstract": "AB", + # Dimensions Mapping + "Publication ID": "UT", + "PubYear": "PY", + "Journal": "SO", + "Times cited": "TC", + } + + @staticmethod + def _parse_multi_value(val: Any, delimiter: str = ";") -> List[str]: + """Parses a multi-value string or list into a list of strings.""" + if val is None or (isinstance(val, float) and pd.isna(val)): + return [] + if isinstance(val, list): + return [str(v).strip() for v in val if v is not None and not (isinstance(v, float) and pd.isna(v))] + if isinstance(val, str): + if val.strip() == "": + return [] + return [v.strip() for v in val.split(delimiter) if v.strip()] + return [str(val).strip()] + + @staticmethod + def _parse_scalar_str(val: Any) -> str: + if val is None or (isinstance(val, float) and pd.isna(val)): + return "" + if isinstance(val, list): + return str(val[0]) if len(val) > 0 else "" + return str(val).strip() + + @staticmethod + def _parse_scalar_int(val: Any) -> Union[int, str]: + if val is None or (isinstance(val, float) and pd.isna(val)): + return 0 + if isinstance(val, list): + val = val[0] if len(val) > 0 else 0 + try: + # Handle float values like 2024.0 or strings like "2024" + if isinstance(val, str): + import re + match = re.search(r'\d{4}', val) + if match: + return int(match.group()) + return int(float(val)) + except (ValueError, TypeError): + return 0 + + @classmethod + def apply_mapping_and_types(cls, data: Union[pd.DataFrame, List[Dict[str, Any]]], db_source: str) -> pd.DataFrame: + """ + Main method to standardize the input data. + + Executes the standardization pipeline: + 1. Translates columns using the MAPPING dictionary. + 2. Ensures all standard WoS columns are present. + 3. Extracts nested JSON fields for specific APIs (like OpenAlex/PubMed). + 4. Enforces strict type contracts via parsing methods. + + Args: + data (Union[pd.DataFrame, List[Dict[str, Any]]]): The raw data fetched from an API or file. + db_source (str): The source identifier (e.g., "OPENALEX", "PUBMED", "SCOPUS"). + + Returns: + pd.DataFrame: A normalized DataFrame strictly conforming to the WoS schema. + """ + if isinstance(data, list): + df = pd.DataFrame(data) + else: + df = data.copy() + + # 1. Rename columns using mapping + df.rename(columns=cls.MAPPING, inplace=True) + + # 2. Ensure all standard columns exist + for col in cls.STANDARD_COLUMNS: + if col not in df.columns: + df[col] = None + + # 3. Explicitly set DB if not present + df["DB"] = db_source.upper() + + # Extract features from nested JSON for OpenAlex and PubMed if needed + # OpenAlex authors + if db_source.upper() == "OPENALEX": + if "authorships" in df.columns: + df["AU"] = df["authorships"].apply( + lambda x: [author.get("author", {}).get("display_name", "") for author in x] if isinstance(x, list) else [] + ) + df["C1"] = df["authorships"].apply( + lambda x: [inst.get("display_name", "") for author in x for inst in author.get("institutions", [])] if isinstance(x, list) else [] + ) + if "host_venue" in df.columns: # Older API format + df["SO"] = df["host_venue"].apply(lambda x: x.get("display_name", "") if isinstance(x, dict) else "") + elif "primary_location" in df.columns: # Newer API format + df["SO"] = df["primary_location"].apply(lambda x: x.get("source", {}).get("display_name", "") if isinstance(x, dict) and x.get("source") else "") + + # PubMed authors + elif db_source.upper() == "PUBMED": + if "authors" in df.columns: + df["AU"] = df["authors"].apply( + lambda x: [author.get("name", "") for author in x] if isinstance(x, list) else [] + ) + + # 4. Type Enforcement and Null Handling + for col in cls.STANDARD_COLUMNS: + if col in cls.LIST_COLUMNS: + df[col] = df[col].apply(cls._parse_multi_value) + elif col in cls.INTEGER_COLUMNS: + df[col] = df[col].apply(cls._parse_scalar_int) + else: + df[col] = df[col].apply(cls._parse_scalar_str) + + # Return only the standard columns + return df[cls.STANDARD_COLUMNS] diff --git a/www/services/validator.py b/www/services/validator.py new file mode 100644 index 000000000..96b0ede6a --- /dev/null +++ b/www/services/validator.py @@ -0,0 +1,67 @@ +import pandas as pd +from typing import List + +class Validator: + """ + Validates that the standardized DataFrame complies strictly with the Web of Science schema. + + This acts as the final gatekeeper in the ETL pipeline. It ensures that + the transformed DataFrame possesses all required columns, does not contain + null values, and adheres strictly to the predefined Type Contracts (e.g., lists + for multi-value fields, integers for years/citations). + """ + + STANDARD_COLUMNS = [ + "DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC", + "AU", "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR" + ] + + LIST_COLUMNS = ["AU", "AF", "C1", "CR", "DE", "ID"] + INTEGER_COLUMNS = ["TC", "PY"] + + @classmethod + def validate(cls, df: pd.DataFrame) -> bool: + """ + Executes strict validation checks on the standardized DataFrame. + + Args: + df (pd.DataFrame): The DataFrame processed by the Standardizer. + + Returns: + bool: True if the DataFrame passes all validation checks. + + Raises: + ValueError: If the DataFrame is empty, is missing mandatory columns, + contains null values, or violates Type Contracts. + """ + if df.empty: + raise ValueError("Validation Error: DataFrame is empty.") + + # 1. Check Mandatory Columns + missing_cols = [col for col in cls.STANDARD_COLUMNS if col not in df.columns] + if missing_cols: + raise ValueError(f"Validation Error: Missing mandatory columns: {missing_cols}") + + # 2. Check for Nulls (NaN or None) + null_counts = df.isnull().sum() + if null_counts.sum() > 0: + raise ValueError(f"Validation Error: DataFrame contains null values:\n{null_counts[null_counts > 0]}") + + # 3. Check Data Types + for col in cls.STANDARD_COLUMNS: + sample_val = df[col].iloc[0] if not df.empty else None + if col in cls.LIST_COLUMNS: + if not df[col].apply(lambda x: isinstance(x, list)).all(): + raise ValueError(f"Validation Error: Column '{col}' must be a list of strings.") + elif col in cls.INTEGER_COLUMNS: + if not pd.api.types.is_integer_dtype(df[col]) and not df[col].apply(lambda x: isinstance(x, int)).all(): + # Check if all can be cast to int, but standardizer should have enforced it + try: + df[col].astype(int) + except ValueError: + raise ValueError(f"Validation Error: Column '{col}' must contain integers.") + else: + if not pd.api.types.is_string_dtype(df[col]) and not df[col].apply(lambda x: isinstance(x, str)).all(): + raise ValueError(f"Validation Error: Column '{col}' must contain strings.") + + return True