diff --git a/.gitignore b/.gitignore index 23b99e089..097e5d010 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,17 @@ __pycache__/ bibliovenv/ Bibenv/ -.idea/ \ No newline at end of file +.idea/ + +# ETL test files +scopus_test.csv +scopus_standardized.csv +openalex_standardized.csv +pubmed_standardized.csv +test_etl.py + +# Jupyter checkpoints +.ipynb_checkpoints/ + +pubmed_test.csv +pubmed_test.txt \ No newline at end of file diff --git a/app.py b/app.py index f0891f894..daa39d125 100644 --- a/app.py +++ b/app.py @@ -55,14 +55,19 @@ import pandas as pd import io from functions import * -from www.services import * +from www.services.utils import * from google import genai -from shiny import express -from shiny import render, ui from google.genai import types -from shiny import reactive, render -from shinywidgets import render_widget +from shiny import reactive +from shiny import express from shiny.express import ui, input, render +from shinywidgets import render_widget +from functions.get_database import get_database +from functions.get_data import get_data +from functions.get_table import get_table +from functions.get_filters import get_filtered_table +from www.services.savereport import add_to_report +from functions.get_filters import get_filtered_table # Setup the Directory for static assets - optimized for performance base_dir = tempfile.gettempdir() # Use system temp dir instead of creating new temp file @@ -760,7 +765,7 @@ def show_data(): @render.ui @reactive.event(input.start_button) def show_table(): - table_ui, _, _ = get_table(database, df) + table_ui, _, _ = get_table(database, df.get()) return table_ui # -------- ADVICE BUTTON -------- @@ -788,7 +793,7 @@ def close_advice_notification(): @render.ui @reactive.event(input.report_modal_completeness) def show_missing_data_report(): - _, missingData, _ = get_table(database, df, modal=False) + _, missingData, _ = get_table(database, df.get(), modal=False) dataframe = 
pd.read_html(io.StringIO(missingData)) report_excel.set(add_to_report(report_choices, report_excel, [dataframe[0]], [], "missingdata")) selection.set(selection.get() + (f"{list(report_choices.get().keys())[-1]}",)) @@ -801,7 +806,7 @@ def show_missing_data_report(): @render.ui @reactive.event(input.save_modal_completeness) def save_dataframe_image(): - _, _, fig = get_table(database, df, dpi=dpi.get(), modal=False) + _, _, fig = get_table(database, df.get(), dpi=dpi.get(), modal=False) fig.write_image(completeness_table_image_path) return ui.notification_show(f"✅ Missing data image saved into {completeness_table_image_path}", duration=5, close_button=False) @@ -868,7 +873,7 @@ def indicator_types_ui_all(): @reactive.calc def filters(): - return get_filters(df) + return get_filters(df.get()) with ui.layout_sidebar(fillable=False, fill=False): # Sidebar for data import options @@ -1060,7 +1065,7 @@ def loading_modal(): return ui.HTML(str(modal) + js) ui.modal_show(loading_modal()) try: - result = get_main_informations(df) + result = get_main_informations(df.get()) return result finally: ui.modal_remove() @@ -1174,7 +1179,7 @@ def table_informations(): data['Average_Citations_per_Doc'][0] ] }) - return ui.HTML(DT(df_box, style="width=100%;")) + return ui.HTML(DT(df_box, style="width:100%;")) # --- Annual Scientific Production Section --- with ui.nav_panel("None", value="annual_scientific_production"): @@ -1215,7 +1220,7 @@ def show_annual_production_report(): with ui.card(full_screen=True): @reactive.calc def annual_informations(): - return get_annual_production(df) + return get_annual_production(df.get()) with ui.navset_underline(id="annual_tab"): with ui.nav_panel("Plot"): @@ -1228,7 +1233,7 @@ def show_annual_production(): @render.ui def table_annual_production(): _, publications_per_year = annual_informations() - return ui.HTML(DT(publications_per_year, style="width=100%;")) + return ui.HTML(DT(publications_per_year, style="width:100%;")) # AI bot Gemini Chat 
Integration # --- Floating Chat Button --- @@ -1369,7 +1374,7 @@ def show_average_citations_report(): with ui.card(full_screen=True): @reactive.calc def average_citations(): - return get_average_citations(df) + return get_average_citations(df.get()) with ui.navset_underline(id="average_tab"): with ui.nav_panel("Plot"): @@ -1382,7 +1387,7 @@ def show_average_citations(): @render.ui def table_average_citations(): _, avg_citations = average_citations() - return ui.HTML(DT(avg_citations, style="width=100%;")) + return ui.HTML(DT(avg_citations, style="width:100%;")) # --- Three-Field Plot Section --- with ui.nav_panel("None", value="three_field_plot"): @@ -1467,7 +1472,7 @@ def calculate_three_field_plot(): middle_field_items = input.middle_field_items() right_field_items = input.right_field_items() - result = get_three_field_plot(df, left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items) + result = get_three_field_plot(df.get(),left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items) three_field_plot_results.set(result) finally: ui.modal_remove() @@ -1601,7 +1606,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: num_of_sources = input.num_of_sources() - result = get_relevant_sources(df, num_of_sources) + result = get_relevant_sources(df.get(),num_of_sources) relevant_sources_results.set(result) finally: ui.modal_remove() @@ -1636,7 +1641,7 @@ def table_relevant_sources(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, relevant_sources_tab = result - return ui.HTML(DT(relevant_sources_tab, style="width=100%;")) + return ui.HTML(DT(relevant_sources_tab, style="width:100%;")) # --- Most Local Cited Sources Section --- with ui.nav_panel("None", value="most_local_cited_sources"): @@ -1745,7 +1750,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: num_of_cited_sources = 
input.num_of_cited_sources() - result = get_local_cited_sources(df, num_of_cited_sources) + result = get_local_cited_sources(df.get(),num_of_cited_sources) local_cited_sources_results.set(result) finally: ui.modal_remove() @@ -1780,7 +1785,7 @@ def table_local_cited_sources(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, local_cited_sources_tab = result - return ui.HTML(DT(local_cited_sources_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_sources_tab, style="width:100%;")) # --- Bradford's Law Section --- with ui.nav_panel("None", value="bradfords_law"): @@ -1821,7 +1826,7 @@ def show_bradfords_law_report(): with ui.card(full_screen=True): @reactive.calc def bradford_law(): - return get_bradford_law(df) + return get_bradford_law(df.get()) with ui.navset_underline(id="bradford_law_tab"): with ui.nav_panel("Plot"): @@ -1834,7 +1839,7 @@ def show_bradford_law(): @render.ui def table_bradford_law(): _, bradford_law_tab = bradford_law() - return ui.HTML(DT(bradford_law_tab, style="width=100%;")) + return ui.HTML(DT(bradford_law_tab, style="width:100%;")) # --- Sources' Local Impact Section --- with ui.nav_panel("None", value="sources_local_impact"): @@ -1945,7 +1950,7 @@ def loading_modal(): try: num_of_sources_local_impact = input.num_of_sources_local_impact() source_local_impact = input.source_local_impact() - result = get_sources_local_impact(df, num_of_sources_local_impact, source_local_impact) + result = get_sources_local_impact(df.get(),num_of_sources_local_impact, source_local_impact) sources_local_impact_results.set(result) finally: ui.modal_remove() @@ -1980,7 +1985,7 @@ def table_sources_local_impact(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, sources_local_impact_tab = result - return ui.HTML(DT(sources_local_impact_tab, 
style="width=100%;")) + return ui.HTML(DT(sources_local_impact_tab, style="width:100%;")) # --- Sources' Production --- with ui.nav_panel("None", value="sources_production"): @@ -2080,7 +2085,7 @@ def loading_modal(): try: num_of_sources_production = input.num_of_sources_production() occurences = input.occurences() - result = get_sources_production(df, num_of_sources_production, occurences) + result = get_sources_production(df.get(),num_of_sources_production, occurences) sources_production_result.set(result) finally: ui.modal_remove() @@ -2126,7 +2131,7 @@ def table_sources_production(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, sources_production_tab = result - return ui.HTML(DT(sources_production_tab, style="width=100%;")) + return ui.HTML(DT(sources_production_tab, style="width:100%;")) # --- Most Relevant Authors Section --- with ui.nav_panel("None", value="most_relevant_authors"): @@ -2227,7 +2232,7 @@ def loading_modal(): try: num_of_authors = input.num_of_authors() frequency = input.frequency() - result = get_relevant_authors(df, num_of_authors, frequency) + result = get_relevant_authors(df.get(), num_of_authors, frequency) relevant_authors_result.set(result) finally: ui.modal_remove() @@ -2273,7 +2278,7 @@ def table_relevant_authors(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, relevant_authors_tab = result - return ui.HTML(DT(relevant_authors_tab, style="width=100%;")) + return ui.HTML(DT(relevant_authors_tab, style="width:100%;")) # --- Most Local Cited Authors Section --- with ui.nav_panel("None", value="most_local_cited_authors"): @@ -2376,7 +2381,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: num_of_cited_authors = input.num_of_cited_authors() - result = get_local_cited_authors(df, num_of_cited_authors) + result = 
get_local_cited_authors(df.get(),num_of_cited_authors) local_cited_authors_result.set(result) finally: ui.modal_remove() @@ -2421,7 +2426,7 @@ def table_local_cited_authors(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, local_cited_authors_tab = result - return ui.HTML(DT(local_cited_authors_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_authors_tab, style="width:100%;")) # --- Authors' Production over Time Section --- with ui.nav_panel("None", value="authors_production"): @@ -2521,7 +2526,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: top_k_authors = input.TopAuthorsProdK() - result = get_author_production_over_time(df, top_k_authors) + result = get_author_production_over_time(df.get(),top_k_authors) au_over_time_result.set(result) finally: ui.modal_remove() @@ -2566,7 +2571,7 @@ def table_authors_production(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, table_authors_production, _ = result - return ui.HTML(DT(table_authors_production, style="width=100%;")) + return ui.HTML(DT(table_authors_production, style="width:100%;")) with ui.nav_panel("Table - Documents"): @render.ui @@ -2584,7 +2589,7 @@ def table_documents(): table_documents['DOI'] = table_documents['DOI'].apply( lambda x: f'{x}' if x != "N/A" else x ) - return ui.HTML(DT(table_documents, style="width=100%;")) + return ui.HTML(DT(table_documents, style="width:100%;")) # AI bot Gemini Chat Integration # --- Floating Chat Button --- @render.express() @@ -2723,7 +2728,7 @@ def show_lotkas_law_report(): with ui.card(full_screen=True): @reactive.calc def lotka_law(): - return get_lotka_law(df) + return get_lotka_law(df.get()) with ui.navset_underline(id="lotka_law_tab"): with ui.nav_panel("Plot"): @@ -2736,7 +2741,7 @@ def show_lotka_law(): @render.ui def 
table_lotka_law(): _, lotka_law_tab = lotka_law() - return ui.HTML(DT(lotka_law_tab, style="width=100%;")) + return ui.HTML(DT(lotka_law_tab, style="width:100%;")) # --- Authors' Local Impact Section --- with ui.nav_panel("None", value="authors_local_impact"): @@ -2837,7 +2842,7 @@ def loading_modal(): try: num_of_authors_local_impact = input.num_of_authors_local_impact() author_local_impact = input.author_local_impact() - result = get_authors_local_impact(df, num_of_authors_local_impact, author_local_impact) + result = get_authors_local_impact(df.get(),num_of_authors_local_impact, author_local_impact) authors_local_impact_result.set(result) finally: ui.modal_remove() @@ -2883,7 +2888,7 @@ def table_authors_local_impact(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, authors_local_impact_tab = result - return ui.HTML(DT(authors_local_impact_tab, style="width=100%;")) + return ui.HTML(DT(authors_local_impact_tab, style="width:100%;")) # --- Most Relevant Affiliations Section --- with ui.nav_panel("None", value="most_relevant_affiliations"): @@ -2984,7 +2989,7 @@ def loading_modal(): try: num_of_affiliations = input.num_of_affiliations() disambiguation = input.disambiguation() - result = get_relevant_affiliations(df, num_of_affiliations, disambiguation) + result = get_relevant_affiliations(df.get(),num_of_affiliations, disambiguation) relevant_affiliations_result.set(result) finally: ui.modal_remove() @@ -3030,7 +3035,7 @@ def table_relevant_affiliations(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, relevant_affiliations_tab = result - return ui.HTML(DT(relevant_affiliations_tab, style="width=100%;")) + return ui.HTML(DT(relevant_affiliations_tab, style="width:100%;")) # --- Affiliations' Production over Time Section --- with ui.nav_panel("None", 
value="affiliations_production"): @@ -3137,7 +3142,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: top_k_affiliations = input.TopAffProdK() - result = get_affiliation_production_over_time(df, top_k_affiliations) + result = get_affiliation_production_over_time(df.get(),top_k_affiliations) affiliations_production_results.set(result) finally: ui.modal_remove() @@ -3172,7 +3177,7 @@ def table_affiliations_production(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, table_affiliations_production = result - return ui.HTML(DT(table_affiliations_production, style="width=100%;")) + return ui.HTML(DT(table_affiliations_production, style="width:100%;")) # --- Affiliations' Local Impact Section --- with ui.nav_panel("None", value="corresponding_authors"): @@ -3281,7 +3286,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: top_k_countries = input.TopCountries() - result = get_corresponding_author_countries(df, top_k_countries) + result = get_corresponding_author_countries(df.get(),top_k_countries) corresponding_authors_results.set(result) finally: ui.modal_remove() @@ -3316,7 +3321,7 @@ def table_countries_collaboration(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, countries_table = result - return ui.HTML(DT(countries_table, style="width=100%;")) + return ui.HTML(DT(countries_table, style="width:100%;")) # --- Countries' Scientific Production Section --- with ui.nav_panel("None", value="countries_scientific_production"): @@ -3406,7 +3411,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: - result = get_countries_production(df) + result = get_countries_production(df.get()) return result finally: ui.modal_remove() @@ -3422,7 +3427,7 @@ def show_countries_production(): @render.ui def table_countries_production(): _, countries_table = 
countries_production() - return ui.HTML(DT(countries_table, style="width=100%;")) + return ui.HTML(DT(countries_table, style="width:100%;")) # --- Countries' Production over Time Section --- with ui.nav_panel("None", value="countries_production_over_time"): @@ -3531,7 +3536,7 @@ def loading_modal(): ui.modal_show(loading_modal()) try: top_k_countries = input.TopCountriesProdK() - result = get_countries_production_over_time(df, top_k_countries) + result = get_countries_production_over_time(df.get(),top_k_countries) countries_over_time_results.set(result) finally: ui.modal_remove() @@ -3566,7 +3571,7 @@ def table_countries_over_time(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, countries_table = result - return ui.HTML(DT(countries_table, style="width=100%;")) + return ui.HTML(DT(countries_table, style="width:100%;")) # --- Most Cited Countries Section --- with ui.nav_panel("None", value="most_cited_countries"): @@ -3677,7 +3682,7 @@ def loading_modal(): try: num_of_cited_countries = input.num_of_cited_countries() cited_countries_measure = input.cited_countries() - result = get_cited_countries(df, num_of_cited_countries, cited_countries_measure) + result = get_cited_countries(df.get(),num_of_cited_countries, cited_countries_measure) cited_countries_results.set(result) finally: ui.modal_remove() @@ -3712,7 +3717,7 @@ def table_cited_countries(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, cited_countries_tab = result - return ui.HTML(DT(cited_countries_tab, style="width=100%;")) + return ui.HTML(DT(cited_countries_tab, style="width:100%;")) # --- Most Global Cited Documents Section --- with ui.nav_panel("None", value="most_global_cited_documents"): @@ -3817,7 +3822,7 @@ def loading_modal(): try: num_of_cited_docs = input.num_of_cited_docs() cited_docs = 
input.cited_docs() - result = get_cited_documents(df, num_of_cited_docs, cited_docs) + result = get_cited_documents(df.get(),num_of_cited_docs, cited_docs) cited_documents_results.set(result) finally: ui.modal_remove() @@ -3852,7 +3857,7 @@ def table_cited_documents(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, cited_documents_tab = result - return ui.HTML(DT(cited_documents_tab, style="width=100%;")) + return ui.HTML(DT(cited_documents_tab, style="width:100%;")) # --- Most Local Cited Documents Section --- with ui.nav_panel("None", value="most_local_cited_documents"): @@ -3964,7 +3969,7 @@ def loading_modal(): # Run analysis num_of_local_cited_docs = input.num_of_local_cited_docs() field_separator = input.field_separator() - result = get_local_cited_documents(df, num_of_local_cited_docs, field_separator) + result = get_local_cited_documents(df.get(),num_of_local_cited_docs, field_separator) local_cited_documents_results.set(result) finally: ui.modal_remove() @@ -3998,7 +4003,7 @@ def table_local_cited_documents(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, local_cited_documents_tab = result - return ui.HTML(DT(local_cited_documents_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_documents_tab, style="width:100%;")) # --- Most Local Cited References Section --- with ui.nav_panel("None", value="most_local_cited_references"): @@ -4110,7 +4115,7 @@ def loading_modal(): # Run analysis num_of_cited_refs = input.num_of_cited_refs() field_separator_ref = input.field_separator_ref() - result = get_local_cited_refs(df, num_of_cited_refs, field_separator_ref) + result = get_local_cited_refs(df.get(),num_of_cited_refs, field_separator_ref) local_cited_refs_results.set(result) finally: ui.modal_remove() @@ -4144,7 +4149,7 @@ def 
table_local_cited_refs(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, local_cited_refs_tab = result - return ui.HTML(DT(local_cited_refs_tab, style="width=100%;")) + return ui.HTML(DT(local_cited_refs_tab, style="width:100%;")) # --- References Spectroscopy Section --- with ui.nav_panel("None", value="references_spectroscopy"): @@ -4260,7 +4265,7 @@ def loading_modal(): start_year = input.start_year() end_year = input.end_year() field_separator_spec = input.field_separator_spec() - result = get_references_spectroscopy(df, start_year, end_year, field_separator_spec) + result = get_references_spectroscopy(df.get(),start_year, end_year, field_separator_spec) ref_spectroscopy_results.set(result) finally: ui.modal_remove() @@ -4294,7 +4299,7 @@ def table_references_rpy(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, ref_rpy_tab, _ = result - return ui.HTML(DT(ref_rpy_tab, style="width=100%;")) + return ui.HTML(DT(ref_rpy_tab, style="width:100%;")) with ui.nav_panel("Table - Cited References"): @render.ui @@ -4306,7 +4311,7 @@ def table_references_spectroscopy(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, _, ref_spectroscopy_tab = result - return ui.HTML(DT(ref_spectroscopy_tab, style="width=100%;")) + return ui.HTML(DT(ref_spectroscopy_tab, style="width:100%;")) # --- Most Frequent Words --- with ui.nav_panel("None", value="most_frequent_words"): @@ -4470,7 +4475,7 @@ def loading_modal(): file_upload_synonyms_mfw = None synonyms_data_mfw = None - result = get_frequent_words(df, ngram_mfw, num_of_words_mfw, field_mfw, file_upload_terms_mfw, file_upload_synonyms_mfw) + result = get_frequent_words(df.get(),ngram_mfw, num_of_words_mfw, field_mfw, 
file_upload_terms_mfw, file_upload_synonyms_mfw) frequent_words_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -4524,7 +4529,7 @@ def table_frequent_words(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, frequent_words_tab = result - return ui.HTML(DT(frequent_words_tab, style="width=100%;")) + return ui.HTML(DT(frequent_words_tab, style="width:100%;")) # --- WordCloud Section --- with ui.nav_panel("None", value="wordcloud"): @@ -4688,7 +4693,7 @@ def loading_modal(): file_upload_synonyms_wc = None synonyms_data_wc = None - result = get_wordcloud(df, ngram_wc, num_of_words_wc, field_wc, file_upload_terms_wc, file_upload_synonyms_wc) + result = get_wordcloud(df.get(),ngram_wc, num_of_words_wc, field_wc, file_upload_terms_wc, file_upload_synonyms_wc) wordcloud_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -4742,7 +4747,7 @@ def table_wordcloud(): style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, wordcloud_tab = result - return ui.HTML(DT(wordcloud_tab, style="width=100%;")) + return ui.HTML(DT(wordcloud_tab, style="width:100%;")) # --- TreeMap Section --- with ui.nav_panel("None", value="treemap"): @@ -4906,7 +4911,7 @@ def loading_modal(): file_upload_synonyms_tm = None synonyms_data_tm = None - result = get_treemap(df, ngram_tm, num_of_words_tm, field_tm, file_upload_terms_tm, file_upload_synonyms_tm) + result = get_treemap(df.get(),ngram_tm, num_of_words_tm, field_tm, file_upload_terms_tm, file_upload_synonyms_tm) treemap_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -4960,7 +4965,7 @@ def table_treemap(): 
style="height: 400px; display: flex; flex-direction: column; justify-content: center; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" ) _, treemap_tab = result - return ui.HTML(DT(treemap_tab, style="width=100%;")) + return ui.HTML(DT(treemap_tab, style="width:100%;")) # --- References Spectroscopy Section --- with ui.nav_panel("None", value="words_frequency_over_time"): @@ -5127,7 +5132,7 @@ def loading_modal(): file_upload_synonyms_wf = None synonyms_data_wf = None - result = get_word_frequency(df, ngram_wf, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words) + result = get_word_frequency(df.get(),ngram_wf, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words) word_frequency_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -5357,7 +5362,7 @@ def loading_modal(): word_mimimum_frequency = input.word_mimimum_frequency() number_of_words_year = input.number_of_words_year() - result = get_trend_topics(df, ngram_tt, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_mimimum_frequency, number_of_words_year) + result = get_trend_topics(df.get(),ngram_tt, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_mimimum_frequency, number_of_words_year) trend_topics_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -5561,7 +5566,7 @@ def loading_modal(): community_repulsion = input.community_repulsion() clustering_algorithm = input.clustering_algorithm() - result = get_clustering_coupling(df, unit_of_analysis, coupling_field, stemmer, impact_measure, cluster_labeling, ngram, num_of_units, min_cluster_freq, label_per_cluster, label_size, community_repulsion, clustering_algorithm) + result = get_clustering_coupling(df.get(),unit_of_analysis, coupling_field, stemmer, impact_measure, cluster_labeling, ngram, 
num_of_units, min_cluster_freq, label_per_cluster, label_size, community_repulsion, clustering_algorithm) clustering_coupling_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -5848,7 +5853,7 @@ def loading_modal(): modal_content.append(ui.markdown("""

Synonyms to Remove

""")) modal_content.append(ui.HTML(DT(synonyms_data))) - result = get_co_occurence_network(df, field_cn, ngram_cn, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes, + result = get_co_occurence_network(df.get(),field_cn, ngram_cn, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes, repulsion_force, remove_isolated, min_edges, node_opacity, num_of_labels, node_shape, label_size_ls, edge_size, node_shadow, edit_nodes, label_cex, file_upload_terms, file_upload_synonyms) co_occurrence_network_results.set(result) @@ -5895,7 +5900,7 @@ def table_co_occurrence_network(): result = co_occurrence_network_results.get() if result is not None: _, _, co_occurrence_network_tab, _ = result - return ui.HTML(DT(co_occurrence_network_tab, style="width=100%;")) + return ui.HTML(DT(co_occurrence_network_tab, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run co-occurrence network", style="text-align: center; color: #999; font-size: 16px;"), @@ -6068,7 +6073,7 @@ def loading_modal(): cluster = input.thematic_clustering() repulsion = input.thematic_repulsion() - result = get_thematic_map(df, field, n, minfreq, ngram, stemming, + result = get_thematic_map(df.get(),field, n, minfreq, ngram, stemming, label_size, n_labels, repulsion, cluster) thematic_map_results.set(result) except Exception as e: @@ -6116,7 +6121,7 @@ def table_thematic_map(): result = thematic_map_results.get() if result is not None: _, _, thematic_map_table, _, _ = result - return ui.HTML(DT(thematic_map_table, style="width=100%;")) + return ui.HTML(DT(thematic_map_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"), @@ -6129,7 +6134,7 @@ def clusters_thematic_map(): result = thematic_map_results.get() if result is not None: _, _, _, thematic_map_cluster, _ = result - return 
ui.HTML(DT(thematic_map_cluster, style="width=100%;")) + return ui.HTML(DT(thematic_map_cluster, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"), @@ -6142,7 +6147,7 @@ def documents_thematic_map(): result = thematic_map_results.get() if result is not None: _, _, _, _, thematic_map_documents = result - return ui.HTML(DT(thematic_map_documents, maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(thematic_map_documents, maxBytes="10MB", style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic map", style="text-align: center; color: #999; font-size: 16px;"), @@ -6403,7 +6408,7 @@ def loading_modal(): ngrams = input.thematic_evolution_ngram() if field in ["TI", "AB"] else 1 stemming = input.thematic_evolution_stemmer() if field in ["TI", "AB"] else False - result = get_thematic_evolution(df, field, years, n, weight_index, min_weight_index, minfreq, label_size, ngrams, stemming, n_labels, overlap, remove_terms, synonyms, cluster) + result = get_thematic_evolution(df.get(),field, years, n, weight_index, min_weight_index, minfreq, label_size, ngrams, stemming, n_labels, overlap, remove_terms, synonyms, cluster) thematic_evolution_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -6444,7 +6449,7 @@ def table_thematic_evolution(): result = thematic_evolution_results.get() if result is not None: _, thematic_evolution_table, _ = result - return ui.HTML(DT(thematic_evolution_table, style="width=100%;")) + return ui.HTML(DT(thematic_evolution_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), @@ -6483,7 +6488,7 @@ def table_thematic_evolution_2(): if result is not None: _, _, TM = result if len(TM) > 0: - return 
ui.HTML(DT(TM[0]["words"], style="width=100%;")) + return ui.HTML(DT(TM[0]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6496,7 +6501,7 @@ def clusters_thematic_evolution_2(): if result is not None: _, _, TM = result if len(TM) > 0: - return ui.HTML(DT(TM[0]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[0]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6509,7 +6514,7 @@ def documents_thematic_evolution_2(): if result is not None: _, _, TM = result if len(TM) > 0: - return ui.HTML(DT(TM[0]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[0]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6547,7 +6552,7 @@ def table_thematic_evolution_3(): if result is not None: _, _, TM = result if len(TM) > 1: - return ui.HTML(DT(TM[1]["words"], style="width=100%;")) + return ui.HTML(DT(TM[1]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; 
align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6560,7 +6565,7 @@ def clusters_thematic_evolution_3(): if result is not None: _, _, TM = result if len(TM) > 1: - return ui.HTML(DT(TM[1]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[1]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6573,7 +6578,7 @@ def documents_thematic_evolution_3(): if result is not None: _, _, TM = result if len(TM) > 1: - return ui.HTML(DT(TM[1]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[1]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6611,7 +6616,7 @@ def table_thematic_evolution_4(): if result is not None: _, _, TM = result if len(TM) > 2: - return ui.HTML(DT(TM[2]["words"], style="width=100%;")) + return ui.HTML(DT(TM[2]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6624,7 +6629,7 @@ def clusters_thematic_evolution_4(): if result is not None: _, _, TM = result if len(TM) > 2: - return ui.HTML(DT(TM[2]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[2]["clusters"], 
style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6637,7 +6642,7 @@ def documents_thematic_evolution_4(): if result is not None: _, _, TM = result if len(TM) > 2: - return ui.HTML(DT(TM[2]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[2]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6675,7 +6680,7 @@ def table_thematic_evolution_5(): if result is not None: _, _, TM = result if len(TM) > 3: - return ui.HTML(DT(TM[3]["words"], style="width=100%;")) + return ui.HTML(DT(TM[3]["words"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6688,7 +6693,7 @@ def clusters_thematic_evolution_5(): if result is not None: _, _, TM = result if len(TM) > 3: - return ui.HTML(DT(TM[3]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[3]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; 
margin: 20px;" @@ -6701,7 +6706,7 @@ def documents_thematic_evolution_5(): if result is not None: _, _, TM = result if len(TM) > 3: - return ui.HTML(DT(TM[3]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[3]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6739,7 +6744,7 @@ def table_thematic_evolution_6(): if result is not None: _, _, TM = result if len(TM) > 4: - return ui.HTML(DT(TM[4]["words"]), style="width=100%;") + return ui.HTML(DT(TM[4]["words"]), style="width:100%;") return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6752,7 +6757,7 @@ def clusters_thematic_evolution_6(): if result is not None: _, _, TM = result if len(TM) > 4: - return ui.HTML(DT(TM[4]["clusters"], style="width=100%;")) + return ui.HTML(DT(TM[4]["clusters"], style="width:100%;")) return ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6765,7 +6770,7 @@ def documents_thematic_evolution_6(): if result is not None: _, _, TM = result if len(TM) > 4: - return ui.HTML(DT(TM[4]["documentToClusters"], maxBytes="10MB", style="width=100%;")) + return ui.HTML(DT(TM[4]["documentToClusters"], maxBytes="10MB", style="width:100%;")) return 
ui.div( ui.p("Click the Run Analysis button to run thematic evolution", style="text-align: center; color: #999; font-size: 16px;"), style="display: flex; flex-direction: column; justify-content: center; align-items: center; height: 300px; border: 2px dashed #ddd; border-radius: 10px; margin: 20px;" @@ -6995,7 +7000,7 @@ def loading_modal(): labelsize=input.wordmap_labelsize() size=input.wordmap_dot_size() - result = get_factorial_analysis(df, ngram, field, terms_data_wm, synonyms_data_wm, n_terms, n_clusters, num_documents, method, dimX, dimY, topWordPlot, threshold, labelsize, size) + result = get_factorial_analysis(df.get(),ngram, field, terms_data_wm, synonyms_data_wm, n_terms, n_clusters, num_documents, method, dimX, dimY, topWordPlot, threshold, labelsize, size) factorial_analysis_results.set(result) except Exception as e: ui.notification_show(f"❌ Error in analysis: {str(e)}", type="error", duration=10) @@ -7051,7 +7056,7 @@ def show_words_by_cluster(): result = factorial_analysis_results.get() if result is not None: _, _, words_by_cluster, _ = result - return ui.HTML(DT(words_by_cluster, style="width=100%;")) + return ui.HTML(DT(words_by_cluster, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run factorial analysis", style="text-align: center; color: #999; font-size: 16px;"), @@ -7064,7 +7069,7 @@ def show_articles_by_cluster(): result = factorial_analysis_results.get() if result is not None: _, _, _, articles_by_cluster = result - return ui.HTML(DT(articles_by_cluster, style="width=100%;")) + return ui.HTML(DT(articles_by_cluster, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to run factorial analysis", style="text-align: center; color: #999; font-size: 16px;"), @@ -7345,7 +7350,7 @@ def show_cocitation_table(): result = co_citation_network_results.get() if result is not None: _, _, cocit_table, _ = result - return ui.HTML(DT(cocit_table, style="width=100%;")) + return 
ui.HTML(DT(cocit_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the co-citation table.", style="text-align: center; color: #666; font-size: 16px;"), @@ -7560,7 +7565,7 @@ def show_hist_table(): result = historiograph_results.get() if result is not None: _, hist_tab, _ = result - return ui.HTML(DT(hist_tab, style="width=100%;")) + return ui.HTML(DT(hist_tab, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the historiograph table.", style="text-align: center; color: #666; font-size: 16px;"), @@ -7865,7 +7870,7 @@ def show_collaboration_table(): result = collaboration_network_results.get() if result is not None: _, _, collab_table, _ = result - return ui.HTML(DT(collab_table, style="width=100%;")) + return ui.HTML(DT(collab_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the collaboration table.", style="text-align: center; color: #666; font-size: 16px;"), @@ -8045,7 +8050,7 @@ def show_world_map_collaboration_table(): result = countries_collaboration_network_results.get() if result is not None: _, world_map_table = result - return ui.HTML(DT(world_map_table, style="width=100%;")) + return ui.HTML(DT(world_map_table, style="width:100%;")) else: return ui.div( ui.p("Click the Run Analysis button to generate the world map collaboration table.", style="text-align: center; color: #666; font-size: 16px;"), diff --git a/demo_etl_pipeline.ipynb b/demo_etl_pipeline.ipynb new file mode 100644 index 000000000..6c45c60c2 --- /dev/null +++ b/demo_etl_pipeline.ipynb @@ -0,0 +1,674 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "12ef2c5b-d603-48eb-a551-a5e36def9c2d", + "metadata": {}, + "source": [ + "# Bibliometrix ETL Pipeline - Demo Notebook\n", + "## From Heterogeneous Bibliographic Data to a Unified Schema\n", + "\n", + "This notebook demonstrates the full ETL pipeline developed for the Bibliometrix-Python project. 
The pipeline standardizes bibliographic data from multiple sources (Scopus, PubMed, OpenAlex) into a unified Web of Science-compatible schema.\n", + "\n", + "### Pipeline Architecture\n", + "- **Extract**: Load data from local files or REST APIs\n", + "- **Transform**: Rename columns, enforce types, handle nulls, calculate SR\n", + "- **Validate**: Check schema, types, and null values before analysis" + ] + }, + { + "cell_type": "markdown", + "id": "d081bd6b-c2ee-4fed-a5c9-8e832ac3f85e", + "metadata": {}, + "source": [ + "## 1. Setup and Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "edabf0ee-54f2-4840-974d-be5f0c6885f2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All modules imported successfully!\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.insert(0, r\"C:\\Users\\mlosc\\bibliometrix-python\")\n", + "import pandas as pd\n", + "from www.services.etl.transformer import transform\n", + "from www.services.etl.validator import validate\n", + "from www.services.etl.api_retriever import retrieve_openalex, retrieve_pubmed\n", + "from www.services.etl.mappings import SCOPUS_CSV_MAPPING\n", + "print(\"All modules imported successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "ab5a1dbb-93ad-40a5-9e73-5ee7efdc2fa4", + "metadata": {}, + "source": [ + "## 2. Base Level - Loading a Scopus CSV File\n", + "In this section we demonstrate the BASE LEVEL of the pipeline.\n", + "We load a manually exported CSV file from Scopus and standardize it." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e8046794-78ff-4348-b19c-b84b63523845", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Raw shape: (20, 45)\n", + "\n", + "Raw column names (Scopus format):\n", + "['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume', 'Issue', 'Art. 
No.', 'Page start', 'Page end', 'Cited by', 'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract', 'Author Keywords', 'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts', 'References', 'Correspondence Address', 'Editors', 'Publisher', 'Sponsors', 'Conference name', 'Conference date', 'Conference location', 'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', 'Language of Original Document', 'Abbreviated Source Title', 'Document Type', 'Publication Stage', 'Open Access', 'Source', 'EID']\n" + ] + } + ], + "source": [ + "df_raw = pd.read_csv(\"scopus_test.csv\",encoding=\"utf-8\")\n", + "print(f\"Raw shape: {df_raw.shape}\")\n", + "print(f\"\\nRaw column names (Scopus format):\")\n", + "print(df_raw.columns.tolist())" + ] + }, + { + "cell_type": "markdown", + "id": "8a2faa93-e9fc-493e-b58a-2eb92af18b4f", + "metadata": {}, + "source": [ + "## 2.1 Transform - Applying the ETL Pipeline\n", + "We apply the mapping dictionary to rename columns to WoS tags, enforce correct data types, fill missing values, and calculate the SR field." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "de137c4b-0f66-4bd4-b3c2-09cbd123e51f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TRANSFORM] Starting transformation for source: SCOPUS\n", + "[TRANSFORM] Columns renamed.\n", + "[TRANSFORM] List columns enforced\n", + "[TRANSFORM] Integer columns enforced.\n", + "[TRANSFORM] Missing columns filled.\n", + "[TRANSFORM] Null values filled.\n", + "[TRANSFORM] SR field calculated.\n", + "[TRANSFORM] Done. Shape: (20, 47).\n", + "\n", + "Standardized shape: (20, 47)\n", + "\n", + "Standardized column names (WoS format):\n", + "['AU', 'AF', 'Author(s) ID', 'TI', 'PY', 'SO', 'VL', 'IS', 'Art. 
No.', 'BP', 'EP', 'TC', 'DI', 'Link', 'C1', 'Authors with affiliations', 'AB', 'DE', 'ID', 'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts', 'CR', 'RP', 'Editors', 'Publisher', 'Sponsors', 'Conference name', 'Conference date', 'Conference location', 'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PMID', 'LA', 'JI', 'DT', 'Publication Stage', 'Open Access', 'Source', 'UT', 'DB', 'SR']\n" + ] + } + ], + "source": [ + "df_scopus = transform(df_raw, SCOPUS_CSV_MAPPING, \"SCOPUS\")\n", + "print(f\"\\nStandardized shape: {df_scopus.shape}\")\n", + "print(f\"\\nStandardized column names (WoS format):\")\n", + "print(df_scopus.columns.tolist())" + ] + }, + { + "cell_type": "markdown", + "id": "7f7f7c73-eb45-4f5d-ab46-d6385369cfc0", + "metadata": {}, + "source": [ + "## 2.2 Validate - Checking the Standardized DataFrame\n", + "The validator checks that all mandatory columns are present, no null values remain, and list columns are correctly typed.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e218f9b8-793b-4ce9-a0cf-a94361b2b9c2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "---Running ETL Validation---\n", + "[OK] All mandatory columns are present.\n", + "[OK] No null values found.\n", + "[OK] All list columns are correctly typed.\n", + "---Validation PASSED---\n", + "\n", + "Sample of key standardized columns:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TIAUPYSOTCSR
0Investigation on the suppression of Gas-Coaldu...[Bin L., Jinzhang J., YangQiang, Dongming W., ...2027Fuel0Bin L., 2027, Fuel
1Impurity effects on CO2 trapping indices: A nu...[Alkhowaildi M., Tariq Z., AlTammar M.J., Hote...2027Fuel0Alkhowaildi M., 2027, Fuel
2Fully elucidating catalyst-driven combustion m...[Wen M., Han J., Zhang X., Zhao Y., Zhang Y., ...2027Fuel0Wen M., 2027, Fuel
3Wettability of geological formations in a CO2 ...[Aboushanab M., Arif M.]2027Fuel0Aboushanab M., 2027, Fuel
4Physics-informed dual integration machine lear...[Zhang M., Zhu W., Mao T., Cao J., Meng X., Bi...2027Fuel0Zhang M., 2027, Fuel
\n", + "
" + ], + "text/plain": [ + " TI \\\n", + "0 Investigation on the suppression of Gas-Coaldu... \n", + "1 Impurity effects on CO2 trapping indices: A nu... \n", + "2 Fully elucidating catalyst-driven combustion m... \n", + "3 Wettability of geological formations in a CO2 ... \n", + "4 Physics-informed dual integration machine lear... \n", + "\n", + " AU PY SO TC \\\n", + "0 [Bin L., Jinzhang J., YangQiang, Dongming W., ... 2027 Fuel 0 \n", + "1 [Alkhowaildi M., Tariq Z., AlTammar M.J., Hote... 2027 Fuel 0 \n", + "2 [Wen M., Han J., Zhang X., Zhao Y., Zhang Y., ... 2027 Fuel 0 \n", + "3 [Aboushanab M., Arif M.] 2027 Fuel 0 \n", + "4 [Zhang M., Zhu W., Mao T., Cao J., Meng X., Bi... 2027 Fuel 0 \n", + "\n", + " SR \n", + "0 Bin L., 2027, Fuel \n", + "1 Alkhowaildi M., 2027, Fuel \n", + "2 Wen M., 2027, Fuel \n", + "3 Aboushanab M., 2027, Fuel \n", + "4 Zhang M., 2027, Fuel " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = validate(df_scopus)\n", + "print(\"\\nSample of key standardized columns:\")\n", + "df_scopus[[\"TI\",\"AU\",\"PY\",\"SO\",\"TC\",\"SR\"]].head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "0d067e7a-db8d-4a8e-ba22-c6e12b7c07ea", + "metadata": {}, + "source": [ + "## 3. Advanced Level - Retrieving Data via API\n", + "Here we demonstrate the ADVANCED LEVEL of the pipeline.\n", + "Data is retrieved automatically from OpenAlex and PubMed REST APIs using a simple text query, with no manual download required." + ] + }, + { + "cell_type": "markdown", + "id": "244029a4-cac5-43b6-864f-e275e436d198", + "metadata": {}, + "source": [ + "### 3.1 OpenAlex API" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b8872656-19b5-4461-9f55-620bed659f65", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[OpenAlex] Searching for: 'machine learning' (max 10 results)\n", + "[OpenAlex] Page 1: retrieved 25 records. 
Total so far: 10\n", + "[OpenAlex] Done. Total records retrieved: 10\n", + "\n", + "Retrieved shape: (10, 23)\n", + "\n", + "First 3 titles:\n", + " - Scikit-learn: Machine Learning in Python\n", + " - Genetic algorithms in search, optimization, and machine learning\n", + " - C4.5: Programs for Machine Learning\n" + ] + } + ], + "source": [ + "df_openalex_raw = retrieve_openalex(query=\"machine learning\", max_results=10)\n", + "print(f\"\\nRetrieved shape: {df_openalex_raw.shape}\")\n", + "print(f\"\\nFirst 3 titles:\")\n", + "for title in df_openalex_raw[\"TI\"].head(3):\n", + " print(f\" - {title}\")" + ] + }, + { + "cell_type": "markdown", + "id": "c9114131-27f9-46b8-a897-fd8ca583621d", + "metadata": {}, + "source": [ + "### 3.2 Transform and Validate OpenAlex data" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "1e1622a4-6787-4d30-b8a7-7073bdf097bd", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TRANSFORM] Starting transformation for source: OPENALEX\n", + "[TRANSFORM] Columns renamed.\n", + "[TRANSFORM] List columns enforced\n", + "[TRANSFORM] Integer columns enforced.\n", + "[TRANSFORM] Missing columns filled.\n", + "[TRANSFORM] Null values filled.\n", + "[TRANSFORM] SR field calculated.\n", + "[TRANSFORM] Done. Shape: (10, 24).\n", + "---Running ETL Validation---\n", + "[OK] All mandatory columns are present.\n", + "[OK] No null values found.\n", + "[OK] All list columns are correctly typed.\n", + "---Validation PASSED---\n", + "\n", + "Sample of key standardized columns:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TIAUPYSOTCSR
0Scikit-learn: Machine Learning in Python[]201263678, 2012,
1Genetic algorithms in search, optimization, an...[]198949333, 1989,
2C4.5: Programs for Machine Learning[]199223696, 1992,
\n", + "
" + ], + "text/plain": [ + " TI AU PY SO TC \\\n", + "0 Scikit-learn: Machine Learning in Python [] 2012 63678 \n", + "1 Genetic algorithms in search, optimization, an... [] 1989 49333 \n", + "2 C4.5: Programs for Machine Learning [] 1992 23696 \n", + "\n", + " SR \n", + "0 , 2012, \n", + "1 , 1989, \n", + "2 , 1992, " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_openalex = transform(df_openalex_raw, {}, \"OPENALEX\")\n", + "result = validate(df_openalex)\n", + "print(\"\\nSample of key standardized columns:\")\n", + "df_openalex[[\"TI\",\"AU\",\"PY\",\"SO\",\"TC\",\"SR\"]].head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "356be294-611c-4e12-b362-14865b3c205f", + "metadata": {}, + "source": [ + "### 3.3 PubMed API" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "bcec90db-76aa-4fcf-a5a8-8d363742d1b6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[PubMed] Searching for: 'machine learning' (max 10 results)\n", + "[PubMed] Found 10 IDs.\n", + "[PubMed] Fetched batch 1.Total so far: 10\n", + "[PubMed] Done. 
Total records retrieved: 10\n", + "\n", + "Retrieve shape: (10, 23)\n", + "\n", + "First 3 titles:\n", + " - Prediction of an fMRI-based schizophrenia biomarker from EEG using dynamic\n", + " - fNIRS Single-trial decoding improves systematically with higher optode density,\n", + " - Comprehensive analysis of m6A RNA methylation regulators and the immune\n" + ] + } + ], + "source": [ + "df_pubmed_raw = retrieve_pubmed(query=\"machine learning\", max_results=10)\n", + "print(f\"\\nRetrieve shape: {df_pubmed_raw.shape}\")\n", + "print(f\"\\nFirst 3 titles:\")\n", + "for title in df_pubmed_raw[\"TI\"].head(3):\n", + " print(f\" - {title}\")" + ] + }, + { + "cell_type": "markdown", + "id": "7d0e6149-a384-4b49-a9d5-1034c6f96051", + "metadata": {}, + "source": [ + "### 3.4 Transform and Validate PubMed data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "86ba80c8-d695-4781-a081-b581b9a8b317", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[TRANSFORM] Starting transformation for source: PUBMED\n", + "[TRANSFORM] Columns renamed.\n", + "[TRANSFORM] List columns enforced\n", + "[TRANSFORM] Integer columns enforced.\n", + "[TRANSFORM] Missing columns filled.\n", + "[TRANSFORM] Null values filled.\n", + "[TRANSFORM] SR field calculated.\n", + "[TRANSFORM] Done. Shape: (10, 24).\n", + "---Running ETL Validation---\n", + "[OK] All mandatory columns are present.\n", + "[OK] No null values found.\n", + "[OK] All list columns are correctly typed.\n", + "---Validation PASSED---\n", + "n\\Sample of key standardized columns:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TIAUPYSOTCSR
0Prediction of an fMRI-based schizophrenia biom...[Tamano R, Ogawa T, Katagiri A, Cai C, Kawanab...2026Biomedical physics & engineering express0Tamano R, 2026, Biomedical physics & engineeri...
1fNIRS Single-trial decoding improves systemati...[Fischer T, Middell E, Moradi S, von Luhmann A]2026Journal of neural engineering0Fischer T, 2026, Journal of neural engineering
2Comprehensive analysis of m6A RNA methylation ...[Liu X, Hu J, Shi G, Zhu W, Hao Q]2026Frontiers in neurology0Liu X, 2026, Frontiers in neurology
\n", + "
" + ], + "text/plain": [ + " TI \\\n", + "0 Prediction of an fMRI-based schizophrenia biom... \n", + "1 fNIRS Single-trial decoding improves systemati... \n", + "2 Comprehensive analysis of m6A RNA methylation ... \n", + "\n", + " AU PY \\\n", + "0 [Tamano R, Ogawa T, Katagiri A, Cai C, Kawanab... 2026 \n", + "1 [Fischer T, Middell E, Moradi S, von Luhmann A] 2026 \n", + "2 [Liu X, Hu J, Shi G, Zhu W, Hao Q] 2026 \n", + "\n", + " SO TC \\\n", + "0 Biomedical physics & engineering express 0 \n", + "1 Journal of neural engineering 0 \n", + "2 Frontiers in neurology 0 \n", + "\n", + " SR \n", + "0 Tamano R, 2026, Biomedical physics & engineeri... \n", + "1 Fischer T, 2026, Journal of neural engineering \n", + "2 Liu X, 2026, Frontiers in neurology " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pubmed = transform(df_pubmed_raw, {}, \"PUBMED\")\n", + "result = validate(df_pubmed)\n", + "print(\"n\\Sample of key standardized columns:\")\n", + "df_pubmed[[\"TI\",\"AU\",\"PY\",\"SO\",\"TC\",\"SR\"]].head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "1682906a-734a-436d-a702-196c1251835b", + "metadata": {}, + "source": [ + "## 4. Exporting the Standardized DataFrame to CSV\n", + "The standardized DataFrame can be exported to CSV for use with the Bibliometrix-Python analytical functions." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "4c63ab59-a3ec-4a1d-96ec-71302f1eaaf7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Scopus standardized CSV saved: scopus_standardized.csv\n", + "OpenAlex standardized CSV saved: openalex_standardized.csv\n", + "PubMed standardized CSV saved: pubmed_standardized.csv\n", + "\n", + "All files exported successfully!\n" + ] + } + ], + "source": [ + "df_scopus.to_csv(\"scopus_standardized.csv\", index=False)\n", + "print(\"Scopus standardized CSV saved: scopus_standardized.csv\")\n", + "df_openalex.to_csv(\"openalex_standardized.csv\", index=False)\n", + "print(\"OpenAlex standardized CSV saved: openalex_standardized.csv\")\n", + "df_pubmed.to_csv(\"pubmed_standardized.csv\", index=False)\n", + "print(\"PubMed standardized CSV saved: pubmed_standardized.csv\")\n", + "print(\"\\nAll files exported successfully!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "afa6b4a8-8275-43c9-af7d-9398a1cd117b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/functions/__init__.py b/functions/__init__.py index 20e24de36..f778f9d49 100644 --- a/functions/__init__.py +++ b/functions/__init__.py @@ -1,43 +1,43 @@ -from .get_affiliationproductionovertime import * -from .get_annualproduction import * -from .get_authorlocalimpact import * -from .get_authorproductionovertime import * -from .get_averagecitations import * -from .get_bradfordlaw import * -from .get_citedcountries import * -from 
.get_citeddocuments import * -from .get_clusteringcoupling import * -from .get_correspondingauthorcountries import * -from .get_countriesproduction import * -from .get_countriesproductionovertime import * -from .get_data import * -from .get_database import * -from .get_filters import * -from .get_frequentwords import * -from .get_localcitedauthors import * -from .get_localciteddocuments import * -from .get_localcitedreferences import * -from .get_localcitedsources import * -from .get_lotkalaw import * -from .get_maininformations import * -from .get_referencesspectroscopy import * -from .get_relevantaffiliations import * -from .get_relevantauthors import * -from .get_relevantsources import * -from .get_sourceslocalimpact import * -from .get_sourcesproduction import * -from .get_status import * -from .get_table import * -from .get_threefieldplot import * -from .get_treemap import * -from .get_trendtopics import * -from .get_wordcloud import * -from .get_wordfrequency import * -from .get_co_occurence_network import * -from .get_thematicmap import * -from .get_factorialanalysis import * -from .get_historiograph import * -from .get_thematicevolution import * -from .get_cocitation import * -from .get_collaborationnetwork import * -from .get_worldmapcollaboration import * \ No newline at end of file +from .get_affiliationproductionovertime import get_affiliation_production_over_time +from .get_annualproduction import get_annual_production +from .get_authorlocalimpact import get_authors_local_impact +from .get_authorproductionovertime import get_author_production_over_time +from .get_averagecitations import get_average_citations +from .get_bradfordlaw import get_bradford_law +from .get_citedcountries import get_cited_countries +from .get_citeddocuments import get_cited_documents +from .get_clusteringcoupling import get_clustering_coupling +from .get_cocitation import get_co_citation +from .get_collaborationnetwork import get_collaboration_network +from 
.get_correspondingauthorcountries import get_corresponding_author_countries +from .get_countriesproduction import get_countries_production +from .get_countriesproductionovertime import get_countries_production_over_time +from .get_co_occurence_network import get_co_occurence_network +from .get_data import get_data +from .get_database import get_database +from .get_factorialanalysis import get_factorial_analysis +from .get_filters import get_filters +from .get_frequentwords import get_frequent_words +from .get_historiograph import get_historiograph +from .get_localcitedauthors import get_local_cited_authors +from .get_localciteddocuments import get_local_cited_documents +from .get_localcitedreferences import get_local_cited_refs +from .get_localcitedsources import get_local_cited_sources +from .get_lotkalaw import get_lotka_law +from .get_maininformations import get_main_informations +from .get_referencesspectroscopy import get_references_spectroscopy +from .get_relevantaffiliations import get_relevant_affiliations +from .get_relevantauthors import get_relevant_authors +from .get_relevantsources import get_relevant_sources +from .get_sourceslocalimpact import get_sources_local_impact +from .get_sourcesproduction import get_sources_production +from .get_status import get_status +from .get_table import get_table +from .get_thematicevolution import get_thematic_evolution +from .get_thematicmap import get_thematic_map +from .get_threefieldplot import get_three_field_plot +from .get_treemap import get_treemap +from .get_trendtopics import get_trend_topics +from .get_wordcloud import get_wordcloud +from .get_wordfrequency import get_word_frequency +from .get_worldmapcollaboration import get_world_map_collaboration \ No newline at end of file diff --git a/functions/get_affiliationproductionovertime.py b/functions/get_affiliationproductionovertime.py index e1b87f583..241c54855 100644 --- a/functions/get_affiliationproductionovertime.py +++ 
b/functions/get_affiliationproductionovertime.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_affiliation_production_over_time(df, top_k_affiliations): @@ -12,7 +22,7 @@ def get_affiliation_production_over_time(df, top_k_affiliations): Returns: A Plotly figure object representing the affiliation's production over time. """ - data = df.get() + data = df.copy() AFF = data["AU_UN"].dropna().apply(lambda x: [aff for aff in x if aff.strip() != ""]) nAFF = [len(aff) for aff in AFF] diff --git a/functions/get_annualproduction.py b/functions/get_annualproduction.py index dd27105c2..cf0416a08 100644 --- a/functions/get_annualproduction.py +++ b/functions/get_annualproduction.py @@ -1,4 +1,15 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go + def get_annual_production(df): @@ -11,7 +22,7 @@ def get_annual_production(df): Returns: A Plotly figure object representing the annual scientific production. 
""" - data = df.get() + data = df.copy() # Calculate the number of publications per year publications_per_year = data["PY"].value_counts().sort_index().reset_index() diff --git a/functions/get_authorlocalimpact.py b/functions/get_authorlocalimpact.py index 74a68e263..6b98005b0 100644 --- a/functions/get_authorlocalimpact.py +++ b/functions/get_authorlocalimpact.py @@ -1,4 +1,15 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +import numpy as np def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impact): @@ -13,7 +24,7 @@ def get_authors_local_impact(df, num_of_authors_local_impact, author_local_impac Returns: A Plotly figure object and a DataFrame of the most impactful sources. 
""" - df = df.get() + df = df.copy() today = pd.Timestamp.now().year # Ensure 'TC' and 'PY' are numeric diff --git a/functions/get_authorproductionovertime.py b/functions/get_authorproductionovertime.py index 65edaca96..ca998725a 100644 --- a/functions/get_authorproductionovertime.py +++ b/functions/get_authorproductionovertime.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_author_production_over_time(df, top_k_authors): @@ -16,7 +26,7 @@ def get_author_production_over_time(df, top_k_authors): table_authors_production (pd.DataFrame): Table summarizing authors' production with TC and TCpY. table_documents (pd.DataFrame): Detailed table with additional document information. 
""" - data = df.get() + data = df.copy() # Ensure "PY" is numeric data["PY"] = pd.to_numeric(data["PY"], errors="coerce") diff --git a/functions/get_averagecitations.py b/functions/get_averagecitations.py index d752aa9b7..f4966ccbe 100644 --- a/functions/get_averagecitations.py +++ b/functions/get_averagecitations.py @@ -1,5 +1,14 @@ from www.services import * - +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_average_citations(df): """ @@ -11,7 +20,7 @@ def get_average_citations(df): Returns: A Plotly figure object representing the average citations per year. """ - data = df.get() + data = df.copy() # Calculate the current year current_year = pd.Timestamp.now().year + 1 diff --git a/functions/get_bradfordlaw.py b/functions/get_bradfordlaw.py index 86580591f..c609bc6d7 100644 --- a/functions/get_bradfordlaw.py +++ b/functions/get_bradfordlaw.py @@ -1,5 +1,15 @@ from www.services import * - +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +import numpy as np def get_bradford_law(df): """ @@ -12,7 +22,7 @@ def get_bradford_law(df): A Plotly figure object and a DataFrame of the Bradford's Law zones. 
""" # Sort data by frequency of occurrence (equivalent to R's sort(table(M$SO), decreasing = TRUE)) - data = df.get() + data = df.copy() source_counts = data["SO"].value_counts() # Total number of sources diff --git a/functions/get_citedcountries.py b/functions/get_citedcountries.py index ac95a8d0c..9d1fd619e 100644 --- a/functions/get_citedcountries.py +++ b/functions/get_citedcountries.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_cited_countries(df, num_of_cited_countries, cited_countries_measure): @@ -15,7 +25,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure): """ # Extract metadata tags for cited countries df = metaTagExtraction(df, "AU1_CO") - df = df.get() + df = df.copy() # Prepare the table for ranking countries tab = ( @@ -68,7 +78,7 @@ def get_cited_countries(df, num_of_cited_countries, cited_countries_measure): y=list(range(n)), mode="markers+text", marker=dict( - size=18 + 6 * (x_values / x_values.max()), + size=(18 + 6 * (x_values / x_values.max())).fillna(18), color=x_values, colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_citeddocuments.py b/functions/get_citeddocuments.py index 14491f74a..e6cf3c52c 100644 --- a/functions/get_citeddocuments.py +++ b/functions/get_citeddocuments.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import 
biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_cited_documents(df, num_of_cited_docs, cited_docs_measure): @@ -15,7 +25,7 @@ def get_cited_documents(df, num_of_cited_docs, cited_docs_measure): """ # Extract metadata tags for cited documents df = metaTagExtraction(df, "SR") - df = df.get() + df = df.copy() # Prepare the table for ranking documents current_year = pd.to_datetime("today").year @@ -74,7 +84,7 @@ def get_cited_documents(df, num_of_cited_docs, cited_docs_measure): y=y_vals, mode="markers+text", marker=dict( - size=18 + 6 * (tab[tab.columns[1]] / tab[tab.columns[1]].max()), + size=(18 + 6 * (tab[tab.columns[1]] / tab[tab.columns[1]].max())).fillna(18), color=tab[tab.columns[1]], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_clusteringcoupling.py b/functions/get_clusteringcoupling.py index 8263a46b3..864313484 100644 --- a/functions/get_clusteringcoupling.py +++ b/functions/get_clusteringcoupling.py @@ -1,4 +1,20 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from www.services.couplingmap import couplingMap, avoid_net_overlaps +import igraph as ig +from pyvis.network import Network +import tempfile +import os + def get_clustering_coupling(df, unit_of_analysis, coupling_measured, stemmer, impact_measure, diff --git a/functions/get_co_occurence_network.py b/functions/get_co_occurence_network.py index 
ec96b143a..b313eee52 100644 --- a/functions/get_co_occurence_network.py +++ b/functions/get_co_occurence_network.py @@ -1,4 +1,20 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from www.services.networkplot import network_plot +from pyvis.network import Network +from www.services.couplingmap import avoid_net_overlaps +import tempfile +import os +import matplotlib.pyplot as plt def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_algorithm_cn, normalization_cn, color_by_year, num_of_nodes, @@ -136,7 +152,7 @@ def get_co_occurence_network(df, field_cn, ngram, network_layout, clustering_alg # Generate layout # Using default igraph layout - layout = cocnet['graph']['layout'] + layout = cocnet['graph'].layout_fruchterman_reingold() print("Layout:", layout) # Get coordinates from layout coords = np.array([[pos[0], pos[1]] for pos in layout]) @@ -479,7 +495,7 @@ def field_by_year(df, field_cn, timespan=None, min_freq=2, n_items=5, remove_ter The field to analyze ('ID', 'DE', 'TI', 'AB', 'WC') """ # Get the field data - M = df.get() + M = df.copy() # Create co-occurrence matrix A = cocMatrix(df, field_cn, binary=False, remove_terms=remove_terms, synonyms=synonyms) diff --git a/functions/get_cocitation.py b/functions/get_cocitation.py index 8bad105c0..23ee7ab98 100644 --- a/functions/get_cocitation.py +++ b/functions/get_cocitation.py @@ -1,5 +1,20 @@ from www.services import * - +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from 
www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from www.services.networkplot import network_plot +from pyvis.network import Network +from www.services.couplingmap import avoid_net_overlaps +import tempfile +import os +import json def get_co_citation( df, field, sep, cocit_network_layout, cocit_clustering_algorithm, cocit_repulsion, @@ -36,7 +51,9 @@ def get_co_citation( degree_plot (plotly.graph_objs.Figure): Degree distribution plot for network nodes. """ - M = df + M = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() + print("M type:", type(M)) + print("M columns:", M.columns.tolist() if isinstance(M, pd.DataFrame) else "NOT A DATAFRAME") # Prepare network and title based on field NetRefs = None @@ -95,7 +112,7 @@ def get_co_citation( b = np.random.randint(0, 255) cluster_colors[cluster_id] = f"rgba({r},{g},{b},0.7)" - layout = cocitnet['graph']['layout'] + layout = cocitnet['graph'].layout_fruchterman_reingold() coords = np.array([[pos[0], pos[1]] for pos in layout]) coords = coords / np.abs(coords).max() coords[:, 0] *= 1000 diff --git a/functions/get_collaborationnetwork.py b/functions/get_collaborationnetwork.py index 512ed7489..0e61058fa 100644 --- a/functions/get_collaborationnetwork.py +++ b/functions/get_collaborationnetwork.py @@ -1,5 +1,21 @@ from www.services import * import json +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from 
www.services.networkplot import network_plot +from pyvis.network import Network +from www.services.couplingmap import avoid_net_overlaps +import tempfile +import os + def get_collaboration_network( df, field, network_layout, clustering_algorithm, repulsion, shape, opacity, shadow, curved, colnormalize, labelsize, edgesize, label_cex, nodes, isolates, edges_min @@ -45,8 +61,11 @@ def get_collaboration_network( print("Generating collaboration network...") + print("isolates value:", isolates, type(isolates)) + df = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() + isolates = False M = df - m = df.get() + m = df.copy() NetRefs = None Title = "" @@ -74,27 +93,34 @@ def get_collaboration_network( normalize = None if colnormalize == "none" else colnormalize # Prepare network plot - netplot = network_plot( - NetMatrix=NetRefs, - normalize=normalize, - Title=Title, - type=network_layout if network_layout != "worldmap" else "auto", - size_cex=True, - size=5, - remove_multiple=False, - edgesize=edgesize * 3, - labelsize=labelsize, - label_cex=label_cex, - curved=curved, - label_n=label_n, - edges_min=edges_min, - label_color=False, - remove_isolates=isolates, - alpha=opacity, - cluster=clustering_algorithm, - community_repulsion=repulsion / 2, - verbose=False - ) + try: + netplot = network_plot( + NetMatrix=NetRefs, + normalize=normalize, + Title=Title, + type=network_layout if network_layout != "worldmap" else "auto", + size_cex=True, + size=5, + remove_multiple=False, + edgesize=edgesize * 3, + labelsize=labelsize, + label_cex=label_cex, + curved=curved, + label_n=label_n, + edges_min=edges_min, + label_color=False, + remove_isolates=isolates, + alpha=opacity, + cluster=clustering_algorithm, + community_repulsion=repulsion / 2, + verbose=False + ) + if len(netplot['graph'].vs) == 0: + raise ValueError("Network is empty. 
Use a larger dataset or disable 'Remove isolated nodes.") + except Exception as e: + import traceback + traceback.print_exc() + raise # Visualization (HTML, density plot, cluster table, degree plot) net = Network(height="98vh", width="100%", notebook=True, cdn_resources="in_line") @@ -108,7 +134,7 @@ def get_collaboration_network( b = np.random.randint(0, 255) cluster_colors[cluster_id] = f"rgba({r},{g},{b},{opacity})" - layout = netplot['graph']['layout'] + layout = netplot['graph'].layout_fruchterman_reingold() coords = np.array([[pos[0], pos[1]] for pos in layout]) coords = coords / np.abs(coords).max() coords[:, 0] *= 1000 diff --git a/functions/get_correspondingauthorcountries.py b/functions/get_correspondingauthorcountries.py index 5ba9832b2..ac8a9645f 100644 --- a/functions/get_correspondingauthorcountries.py +++ b/functions/get_correspondingauthorcountries.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_corresponding_author_countries(df, top_k_countries): @@ -15,7 +25,7 @@ def get_corresponding_author_countries(df, top_k_countries): # Estrai i metadati "AU_CO" e "AU1_CO" e verifica il tipo di dati df = metaTagExtraction(df, Field="AU_CO") # Assumendo che `metaTagExtraction` sia già definita df = metaTagExtraction(df, Field="AU1_CO") - data = df.get() # Se `df` è un oggetto reattivo + data = df.copy() # Se `df` è un oggetto reattivo # Assicurati che le colonne siano di tipo stringa e rimuovi righe con valori mancanti data = data.dropna(subset=["AU1_CO", "AU_CO"]) diff --git a/functions/get_countriesproduction.py 
b/functions/get_countriesproduction.py index 81c0e0c34..622d1d78a 100644 --- a/functions/get_countriesproduction.py +++ b/functions/get_countriesproduction.py @@ -1,5 +1,21 @@ from www.services import * - +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import geopandas as gpd def get_countries_production(df): """ @@ -13,7 +29,7 @@ def get_countries_production(df): """ # Assicurati che i metadati siano stati estratti df = metaTagExtraction(df, "AU_CO") - df = df.get() + df = df.copy() # Conta le occorrenze dei paesi df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x]) diff --git a/functions/get_countriesproductionovertime.py b/functions/get_countriesproductionovertime.py index aede25bbd..587ebfb4f 100644 --- a/functions/get_countriesproductionovertime.py +++ b/functions/get_countriesproductionovertime.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import 
plotly.graph_objects as go def get_countries_production_over_time(df, top_k_countries): @@ -13,7 +23,7 @@ def get_countries_production_over_time(df, top_k_countries): A Plotly figure object representing the country's production over time. """ df = metaTagExtraction(df, "AU_CO") - data = df.get() + data = df.copy() AFF = pd.Series(data["AU_CO"]).dropna().apply(lambda x: [aff.strip() for aff in x if aff.strip() != ""]) nAFF = [len(aff) for aff in AFF] diff --git a/functions/get_data.py b/functions/get_data.py index 16baed992..9e00ec3c5 100644 --- a/functions/get_data.py +++ b/functions/get_data.py @@ -1,4 +1,18 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from shiny.express import ui +from www.services.format_functions import biblio_json, process_multiple_files +from io import StringIO +from shiny.types import FileInfo def get_data(input, database, df, reset_callback=None): diff --git a/functions/get_database.py b/functions/get_database.py index 5c5d4edc5..733264d9e 100644 --- a/functions/get_database.py +++ b/functions/get_database.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_database(input): diff --git 
a/functions/get_factorialanalysis.py b/functions/get_factorialanalysis.py index 3324bcfb6..540ce6df7 100644 --- a/functions/get_factorialanalysis.py +++ b/functions/get_factorialanalysis.py @@ -1,5 +1,28 @@ from www.services import * from scipy.spatial import ConvexHull, QhullError +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +import numpy as np +from www.services.tabletag import table_tag +from typing import Union, Optional, Sequence, Dict, List +import math +from pyvis.network import Network +import tempfile +import os +from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, to_tree +from scipy.spatial.distance import pdist +from prince import CA, MCA +from sklearn.manifold import MDS as SK_MDS +from sklearn.preprocessing import StandardScaler + def distance_to_y(dist, max_dist, scale_factor): norm = math.log1p(dist) / math.log1p(max_dist) @@ -74,7 +97,7 @@ def get_factorial_analysis( # Set ngrams based on word_type ngrams = int(ngram) if field in ['TI', 'AB'] else 1 - M = df.get() + M = df.copy() tab = table_tag(M, field, ngrams) if len(tab) >= 2: diff --git a/functions/get_filters.py b/functions/get_filters.py index 206c215aa..83a1b7941 100644 --- a/functions/get_filters.py +++ b/functions/get_filters.py @@ -1,5 +1,15 @@ from www.services import * from functions.get_table import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix 
import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_filters(df): @@ -12,7 +22,7 @@ def get_filters(df): Returns: A DataFrame with additional columns for filters and metrics. """ - data = df.get() + data = df.copy() # Calculate the minimum and maximum publication years data["Min_Year"] = data["PY"].min() diff --git a/functions/get_frequentwords.py b/functions/get_frequentwords.py index 8d790ffe1..74a0c37f3 100644 --- a/functions/get_frequentwords.py +++ b/functions/get_frequentwords.py @@ -1,4 +1,15 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from collections import Counter def get_frequent_words(df, ngram, num_of_words, word_type, file_upload_terms, file_upload_synonyms, field_separator_frequent=';'): @@ -100,7 +111,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): """ Extract and count words from a specified field in the DataFrame. 
""" - M = df.get() + M = df.copy() # Remove duplicates M = M.drop_duplicates(subset='SR') @@ -109,7 +120,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): if tag in ['AB', 'TI']: text_data = term_extraction(df, field=tag, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - text_data = text_data.get() + text_data = text_data[f"{tag}_TM"] else: text_data = M[tag] diff --git a/functions/get_historiograph.py b/functions/get_historiograph.py index 089d02387..1c6763542 100644 --- a/functions/get_historiograph.py +++ b/functions/get_historiograph.py @@ -2,9 +2,19 @@ from pyvis.network import Network import tempfile import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go import networkx as nx import os from matplotlib.colors import to_rgba +from www.services.histplot import histPlot def hex_to_rgba(hex_color, alpha): if not isinstance(hex_color, str) or not hex_color.startswith("#") or len(hex_color) != 7: @@ -27,18 +37,24 @@ def get_historiograph(df, node_label="AU1", histNodes=20, hist_isolates=True, hi filename: nome del file HTML interattivo salvato temporaneamente """ # Pre-elaborazione + df = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() df = metaTagExtraction(df, "SR") hist_results = histNetwork(df, min_citations=0, sep=sep, network=True) # 1. 
Costruzione iniziale del grafo - hist_plot = histPlot( - hist_results, - n=histNodes, - size=histsize, - remove_isolates=False, # rimozione manuale - label=node_label, - verbose=False - ) + try: + hist_plot = histPlot( + hist_results, + n=histNodes, + size=histsize, + remove_isolates=False, # rimozione manuale + label=node_label, + verbose=False + ) + except Exception as e: + import traceback + traceback.print_exc() + raise # 2. Recupera layout e rete iniziale layout_df = pd.DataFrame(hist_plot["layout"]).copy() @@ -90,17 +106,24 @@ def get_historiograph(df, node_label="AU1", histNodes=20, hist_isolates=True, hi # Rimozione Year mancanti hist_data = hist_data[hist_data["Year"].notna()].copy() if hist_data.empty: - raise ValueError("Nessun dato con 'Year' valido per la historiograph.") + hist_data = hist_results["histData"].copy() + hist_data["Year"] = hist_data["Year"].fillna(0) # Posizionamento temporale orizzontale - hist_data = hist_data.sort_values(['cluster', 'Year']) + if 'cluster' in hist_data.columns: + hist_data = hist_data.sort_values(['cluster','Year']) + else: + hist_data = hist_data.sort_values(['Year']) min_year = hist_data["Year"].min() year_range = hist_data["Year"].max() - min_year + 1 # Spazio orizzontale compatto hist_data["x"] = (hist_data["Year"] - min_year) * 60 # invece di /year_range * 1000 # Spazio verticale più ravvicinato tra cluster - hist_data["y"] = hist_data["cluster"] * 150 + np.random.uniform(-30, 30, size=len(hist_data)) + if 'cluster' in hist_data.columns: + hist_data["y"] = hist_data["cluster"] * 150 + np.random.uniform(-30, 30, size=len(hist_data)) + else: + hist_data["y"] = np.random.uniform(-30, 30, size=len(hist_data)) # Tooltip e label robusti diff --git a/functions/get_localcitedauthors.py b/functions/get_localcitedauthors.py index e663192bc..79795e6fd 100644 --- a/functions/get_localcitedauthors.py +++ b/functions/get_localcitedauthors.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np 
+from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_local_cited_authors(df, num_of_cited_authors, fast_search=False): @@ -20,7 +30,7 @@ def get_local_cited_authors(df, num_of_cited_authors, fast_search=False): loccit = 1 df = metaTagExtraction(df, "SR") - M = df.get() + M = df.copy() # Fill missing values M['TC'] = M['TC'].fillna(0) @@ -68,13 +78,17 @@ def get_local_cited_authors(df, num_of_cited_authors, fast_search=False): layer="below", ) + for col in author_counts.columns: + if author_counts[col].dtype in ['float64','int64']: + author_counts[col] = author_counts[col].fillna(0) + fig.add_trace( go.Scatter( x=author_counts[frequency], y=list(range(len(author_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (author_counts[frequency] / author_counts[frequency].max()), + size=(18 + 6 * (author_counts[frequency] / author_counts[frequency].max())).fillna(18), color=author_counts[frequency], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_localciteddocuments.py b/functions/get_localciteddocuments.py index 1dea8d5a5..442310d95 100644 --- a/functions/get_localciteddocuments.py +++ b/functions/get_localciteddocuments.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import 
plotly.graph_objects as go def get_local_cited_documents(df, num_of_local_cited_docs, field_separator, fast_search=False): @@ -14,7 +24,7 @@ def get_local_cited_documents(df, num_of_local_cited_docs, field_separator, fast A Plotly figure object and a DataFrame of the most local cited documents. """ df = metaTagExtraction(df, "SR") - M = df.get() + M = df.copy() # Determine the local citation threshold if fast_search: @@ -79,7 +89,7 @@ def get_local_cited_documents(df, num_of_local_cited_docs, field_separator, fast y=list(range(len(df_documents))), mode="markers+text", marker=dict( - size=18 + 6 * (df_documents["Local Citations"] / df_documents["Local Citations"].max()), + size=(18 + 6 * (df_documents["Local Citations"] / df_documents["Local Citations"].max())).fillna(18), color=df_documents["Local Citations"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_localcitedreferences.py b/functions/get_localcitedreferences.py index 68ea11fef..17e8919b1 100644 --- a/functions/get_localcitedreferences.py +++ b/functions/get_localcitedreferences.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_local_cited_refs(df, num_of_cited_refs, field_separator): @@ -13,7 +23,7 @@ def get_local_cited_refs(df, num_of_cited_refs, field_separator): Returns: A Plotly figure object and a DataFrame of the most local cited sources. 
""" - data = df.get() + data = df.copy() if isinstance(data["CR"].iloc[0], list): # Check if the first element is a list # Flatten the 'CR' column containing lists @@ -63,7 +73,7 @@ def get_local_cited_refs(df, num_of_cited_refs, field_separator): y=list(range(len(source_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (source_counts["Citations"] / source_counts["Citations"].max()), + size=(18 + 6 * (source_counts["Citations"] / source_counts["Citations"].max())).fillna(18), color=source_counts["Citations"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_localcitedsources.py b/functions/get_localcitedsources.py index 74b261455..3dda4007a 100644 --- a/functions/get_localcitedsources.py +++ b/functions/get_localcitedsources.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_local_cited_sources(df, num_of_cited_sources): @@ -16,7 +26,7 @@ def get_local_cited_sources(df, num_of_cited_sources): # Extract metadata tags for cited sources df = metaTagExtraction(df, "CR_SO") - data = df.get() + data = df.copy() if isinstance(data["CR_SO"].iloc[0], list): # Check if the first element is a list # Flatten the 'CR_SO' column containing lists @@ -55,7 +65,7 @@ def wrap_label(label, width=50): y=list(range(len(source_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (source_counts["N. of Local Citations"] / source_counts["N. of Local Citations"].max()), + size=(18 + 6 * (source_counts["N. of Local Citations"] / source_counts["N. 
of Local Citations"].max())).fillna(18), color=source_counts["N. of Local Citations"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_lotkalaw.py b/functions/get_lotkalaw.py index 94545fda2..778830f7f 100644 --- a/functions/get_lotkalaw.py +++ b/functions/get_lotkalaw.py @@ -1,5 +1,15 @@ from www.services import * - +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +import numpy as np def get_lotka_law(df): """ @@ -14,7 +24,7 @@ def get_lotka_law(df): """ # Calculate Lotka's Law - data = df.get() + data = df.copy() # Author Productivity (Lotka's Law) authors = pd.Series([author.strip() for sublist in data['AU'] for author in sublist]) diff --git a/functions/get_maininformations.py b/functions/get_maininformations.py index 97443abdb..764ff43e9 100644 --- a/functions/get_maininformations.py +++ b/functions/get_maininformations.py @@ -1,4 +1,14 @@ from www.services import * +import time +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_main_informations(df, log=False): @@ -12,13 +22,13 @@ def get_main_informations(df, log=False): Returns: A DataFrame with additional columns for filters and metrics. 
""" - data = df.get() + data = df.copy() #### Min and Max Year #### start_time = time.time() # Calculate the minimum and maximum publication years - data["Min_Year"] = data["PY"].min() - data["Max_Year"] = data["PY"].max() + data["Min_Year"] = pd.to_numeric(data["PY"], errors="coerce").min() + data["Max_Year"] = pd.to_numeric(data["PY"], errors="coerce").max() print(f"Min and Max Year calculation time: {time.time() - start_time:.4f} seconds") #### Unique Sources #### @@ -98,8 +108,8 @@ def count_authors(entry): # Ensure the 'AU_CO' column exists if "AU_CO" not in data.columns: # Extract the required metadata - df = metaTagExtraction(df, "AU_CO") - data = df.get() + df_temp = metaTagExtraction(df, "AU_CO") + data["AU_CO"] = df_temp["AU_CO"] # Calculate "Country_Count" with a vectorized function data["Country_Count"] = data["AU_CO"].apply(lambda x: len(set(x))) diff --git a/functions/get_referencesspectroscopy.py b/functions/get_referencesspectroscopy.py index a2c3e1522..97bf15162 100644 --- a/functions/get_referencesspectroscopy.py +++ b/functions/get_referencesspectroscopy.py @@ -1,4 +1,16 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +import re +from plotly.subplots import make_subplots def get_references_spectroscopy(df, start_year, end_year=2005, field_separator_spec=';'): @@ -16,7 +28,7 @@ def get_references_spectroscopy(df, start_year, end_year=2005, field_separator_s rpys_table (pd.DataFrame): Table with RPYS data (years, citations, deviation from median, top references). 
cr_table (pd.DataFrame): Table of cited references with local citation counts and Google Scholar links. """ - df = df.get() + df = df.copy() # Pulizia e preparazione dei dati c_references = df['CR'].apply(lambda x: [i for i in x]).explode() diff --git a/functions/get_relevantaffiliations.py b/functions/get_relevantaffiliations.py index b86e36509..20268d7e9 100644 --- a/functions/get_relevantaffiliations.py +++ b/functions/get_relevantaffiliations.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_relevant_affiliations(df, num_of_affiliations, disambiguation): @@ -13,7 +23,7 @@ def get_relevant_affiliations(df, num_of_affiliations, disambiguation): Returns: A Plotly figure object and a DataFrame of the most relevant authors. 
""" - data = df.get() + data = df.copy() if disambiguation == "yes": # Extract affiliations from the "AU_UN" field @@ -40,7 +50,7 @@ def get_relevant_affiliations(df, num_of_affiliations, disambiguation): y=list(range(len(affiliation_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (affiliation_counts["Articles"] / affiliation_counts["Articles"].max()), + size=(18 + 6 * (affiliation_counts["Articles"] / affiliation_counts["Articles"].max())).fillna(18), color=affiliation_counts["Articles"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_relevantauthors.py b/functions/get_relevantauthors.py index cdf960151..a7be9453a 100644 --- a/functions/get_relevantauthors.py +++ b/functions/get_relevantauthors.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_relevant_authors(df, num_of_authors, frequency="N. of Documents"): @@ -13,7 +23,7 @@ def get_relevant_authors(df, num_of_authors, frequency="N. of Documents"): Returns: A Plotly figure object and a DataFrame of the most relevant authors. """ - data = df.get() + data = df.copy() # Drop rows with missing values data = data.dropna(subset=["AU"]) @@ -72,7 +82,7 @@ def get_relevant_authors(df, num_of_authors, frequency="N. 
of Documents"): y=list(range(len(author_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (author_counts[frequency] / author_counts[frequency].max()), + size=(18 + 6 * (author_counts[frequency] / author_counts[frequency].max())).fillna(18), color=author_counts[frequency], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_relevantsources.py b/functions/get_relevantsources.py index dccd8d3e5..7199bba2e 100644 --- a/functions/get_relevantsources.py +++ b/functions/get_relevantsources.py @@ -1,4 +1,15 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go + def get_relevant_sources(df, num_of_sources): @@ -12,7 +23,7 @@ def get_relevant_sources(df, num_of_sources): Returns: A Plotly figure object and a DataFrame of the most relevant sources. """ - data = df.get() + data = df.copy() # Drop rows with missing values data = data.dropna(subset=["SO"]) @@ -55,7 +66,7 @@ def wrap_label(label, width=50): y=list(range(len(source_counts))), mode="markers+text", marker=dict( - size=18 + 6 * (source_counts["N. of Documents"] / source_counts["N. of Documents"].max()), + size=(18 + 6 * (source_counts["N. of Documents"] / source_counts["N. of Documents"].max())).fillna(18), color=source_counts["N. 
of Documents"], colorscale=[[0, "#B3D1F2"], [1, "#5567BB"]], line=dict(width=1, color="#E0E0E0"), diff --git a/functions/get_sourceslocalimpact.py b/functions/get_sourceslocalimpact.py index 731c97194..644726b7a 100644 --- a/functions/get_sourceslocalimpact.py +++ b/functions/get_sourceslocalimpact.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_sources_local_impact(df, num_of_sources_local_impact, source_local_impact): @@ -13,7 +23,7 @@ def get_sources_local_impact(df, num_of_sources_local_impact, source_local_impac Returns: A Plotly figure object and a DataFrame of the most impactful sources. """ - df = df.get() + df = df.copy() today = pd.Timestamp.now().year # Ensure 'TC' and 'PY' are numeric diff --git a/functions/get_sourcesproduction.py b/functions/get_sourcesproduction.py index 0795668d7..cf3ca0cbf 100644 --- a/functions/get_sourcesproduction.py +++ b/functions/get_sourcesproduction.py @@ -1,4 +1,15 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from www. 
services.cocmatrix import cocMatrix def get_sources_production(df, num_of_sources_production, occurences): @@ -13,7 +24,7 @@ def get_sources_production(df, num_of_sources_production, occurences): Returns: A Plotly figure object representing the sources' production over time. """ - data = df.get() + data = df.copy() # Calculate the number of publications per year for each source WSO = cocMatrix(df, Field="SO") diff --git a/functions/get_status.py b/functions/get_status.py index b5c412e67..5b717332e 100644 --- a/functions/get_status.py +++ b/functions/get_status.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_status(missing_percentage): diff --git a/functions/get_table.py b/functions/get_table.py index 75b9c91d8..170bc7e87 100644 --- a/functions/get_table.py +++ b/functions/get_table.py @@ -1,5 +1,19 @@ from www.services import * from functions.get_status import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from shiny.express import ui +from www.services.utils import ICONS +from itables import to_html_datatable as DT +from itables.javascript import JavascriptFunction # Function to create a Plotly table visualization for metadata completeness @@ -79,7 +93,11 @@ 
def get_table(database, df, dpi=300, filter=False, modal=True): A DataTable object if data is available, otherwise a message indicating no data. """ # Retrieve the data from the DataFrame - data = df.get() + if df is None: + return None, None, None + data = df() if callable(df) else df.copy() + if data is None: + return None, None, None table_html = "" fig = None @@ -205,7 +223,7 @@ def get_table(database, df, dpi=300, filter=False, modal=True): # Return a DataTable object with the data and the HTML/Plotly tables return ui.HTML( DT( - df.get(), + data, maxBytes="10MB", classes="display compact stripe", style="text-transform: uppercase; font-size: small; table-layout: auto;", diff --git a/functions/get_thematicevolution.py b/functions/get_thematicevolution.py index 65bb0077b..8c50c7e96 100644 --- a/functions/get_thematicevolution.py +++ b/functions/get_thematicevolution.py @@ -1,4 +1,21 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from pyvis.network import Network +import tempfile +import os +import matplotlib.pyplot as plt +import matplotlib.colors as mcolors +from matplotlib.colors import to_hex +from www.services.thematicmap import thematic_map def get_thematic_evolution(df, field="ID", years=None, n=250, weight_index="inc_index", min_weight_index=0.1, minFreq=2, @@ -94,7 +111,6 @@ def thematic_evolution(M, field="ID", years=None, n=250, min_freq=2, size=0.5, n for interval_label, Mk in list_df.items(): Y.append(f"{min(Mk['PY'])}-{max(Mk['PY'])}") - Mk = reactive.Value(Mk) resk_tuple = thematic_map( Mk, field=field, n=n, minfreq=min_freq, ngrams=ngrams, @@ 
-310,7 +326,7 @@ def timeslice(M, breaks=None, k=5): Returns: dict: Dictionary containing DataFrames for each sub-period. """ - M = M.get() + # Convert the 'PY' column to numeric M['PY'] = pd.to_numeric(M['PY'], errors='coerce') @@ -320,6 +336,9 @@ def timeslice(M, breaks=None, k=5): breaks = np.floor(np.linspace(M['PY'].min() - 1, M['PY'].max(), k + 1)) else: breaks = [M['PY'].min() - 1] + breaks + [M['PY'].max()] + breaks = sorted(list(set(breaks))) + if len(breaks) < 2: + raise ValueError("Not enough distinct break points for time sclicing.") # print("breaks:", breaks) @@ -332,6 +351,7 @@ def timeslice(M, breaks=None, k=5): # Split the DataFrame based on intervals split_df = {str(interval): M[M['interval'] == interval].drop(columns=['interval']) for interval in intervals} + split_df = {k: v for k,v in split_df.items() if len(v) > 0} return split_df diff --git a/functions/get_thematicmap.py b/functions/get_thematicmap.py index 68d1f37d6..009e185a0 100644 --- a/functions/get_thematicmap.py +++ b/functions/get_thematicmap.py @@ -1,4 +1,15 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from www.services.thematicmap import thematic_map def get_thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False): diff --git a/functions/get_threefieldplot.py b/functions/get_threefieldplot.py index b7a4a1514..2afbf1800 100644 --- a/functions/get_threefieldplot.py +++ b/functions/get_threefieldplot.py @@ -1,5 +1,15 @@ from www.services 
import * import textwrap +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_three_field_plot(df, left_field, middle_field, right_field, left_field_items, middle_field_items, right_field_items): diff --git a/functions/get_treemap.py b/functions/get_treemap.py index 1f3f765f0..1e2f9f8f7 100644 --- a/functions/get_treemap.py +++ b/functions/get_treemap.py @@ -1,4 +1,16 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from collections import Counter + def get_treemap(df, ngram, num_of_words, word_type, file_upload_terms, file_upload_synonyms, field_separator_frequent=';'): @@ -75,7 +87,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): """ Extract and count words from a specified field in the DataFrame. 
""" - M = df.get() + M = df.copy() # Remove duplicates M = M.drop_duplicates(subset='SR') @@ -84,7 +96,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): if tag in ['AB', 'TI']: text_data = term_extraction(df, field=tag, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - text_data = text_data.get() + text_data = text_data[f"{tag}_TM"] else: text_data = M[tag] diff --git a/functions/get_trendtopics.py b/functions/get_trendtopics.py index 1d2f1df3a..00aa7f762 100644 --- a/functions/get_trendtopics.py +++ b/functions/get_trendtopics.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_trend_topics(df, ngram, field_tt, time_window, file_upload_terms_tt, file_upload_synonyms_tt, word_minimum_frequency, number_of_words_year): @@ -49,6 +59,8 @@ def get_trend_topics(df, ngram, field_tt, time_window, file_upload_terms_tt, fil # Get trend topics trend_topics = field_by_year(df, field, time_window, word_minimum_frequency, number_of_words_year, remove_terms, synonyms) + print(trend_topics.columns.tolist()) + print(trend_topics.head()) # Plot fig = px.scatter(trend_topics, x='year_med', y='item', size='freq', hover_data=['year_q1', 'year_q3'], height=800) @@ -99,13 +111,14 @@ def field_by_year(df, field, timespan, min_freq, n_items, remove_terms=None, syn # Create co-occurrence matrix A = cocMatrix(df, Field=field, binary=False, remove_terms=remove_terms, synonyms=synonyms) n = A.sum(axis=0).to_numpy() # Convert to 1D array - df = df.get() + df = df.copy() # Calculate quantiles trend_med = 
pd.DataFrame(A.values).apply(lambda x: pd.Series(np.round(np.quantile(np.repeat(df['PY'], x), [0.25, 0.5, 0.75]))), axis=0).T trend_med.columns = ['year_q1', 'year_med', 'year_q3'] trend_med['freq'] = n - trend_med['item'] = A.columns + trend_med['item'] = A.columns.tolist() + trend_med['year_med'] = trend_med['year_med'].astype(float) # Filter by timespan and frequency if timespan is None or len(timespan) != 2: @@ -113,6 +126,6 @@ def field_by_year(df, field, timespan, min_freq, n_items, remove_terms=None, syn trend_med = trend_med[(trend_med['year_med'] >= timespan[0]) & (trend_med['year_med'] <= timespan[1])] trend_med = trend_med[trend_med['freq'] >= min_freq] - trend_med = trend_med.groupby('year_med').apply(lambda x: x.nlargest(n_items, 'freq')).reset_index(drop=True) + trend_med = trend_med.sort_values('freq', ascending=False).groupby('year_med', group_keys=False).head(n_items).reset_index(drop=True) return trend_med diff --git a/functions/get_wordcloud.py b/functions/get_wordcloud.py index e902f3bd6..2631ead1f 100644 --- a/functions/get_wordcloud.py +++ b/functions/get_wordcloud.py @@ -1,4 +1,22 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go +from collections import Counter +import networkx as nx +import matplotlib.colors as mcolors +import random +import math +from pyvis.network import Network +import tempfile +import os def is_legible_on_white(color): @@ -106,7 +124,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): """ Extract and count words from a specified field in the DataFrame. 
""" - M = df.get() + M = df.copy() # Remove duplicates M = M.drop_duplicates(subset='SR') @@ -115,7 +133,7 @@ def table_tag(df, tag, ngrams=1, remove_terms=None, synonyms=None): if tag in ['AB', 'TI']: text_data = term_extraction(df, field=tag, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - text_data = text_data.get() + text_data = text_data[f"{tag}_TM"] else: text_data = M[tag] diff --git a/functions/get_wordfrequency.py b/functions/get_wordfrequency.py index 1f2b81a06..4015e23c3 100644 --- a/functions/get_wordfrequency.py +++ b/functions/get_wordfrequency.py @@ -1,4 +1,14 @@ from www.services import * +import pandas as pd +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork +import plotly.express as px +import plotly.graph_objects as go def get_word_frequency(df, ngram, field_wf, file_upload_terms_wf, file_upload_synonyms_wf, occurrences, top_words): @@ -39,7 +49,7 @@ def get_word_frequency(df, ngram, field_wf, file_upload_terms_wf, file_upload_sy data = term_extraction(df, field=field_wf, stemming=False, verbose=False, ngrams=ngrams, remove_terms=remove_terms, synonyms=synonyms) - data = data.get() + if field_wf == 'TI': print(data[f"{field_wf}_TM"]) diff --git a/functions/get_worldmapcollaboration.py b/functions/get_worldmapcollaboration.py index 9edafa879..41a103395 100644 --- a/functions/get_worldmapcollaboration.py +++ b/functions/get_worldmapcollaboration.py @@ -1,17 +1,26 @@ from www.services import * import pandas as pd -import geopandas as gpd -import networkx as nx +import numpy as np +from www.services.metatagextraction import metaTagExtraction +from www.services.termextraction import term_extraction +from 
www.services.biblionetwork import biblionetwork +from www.services.biblionetwork import biblionetwork +from www.services.cocmatrix import cocMatrix +from www.services.histnetwork import histNetwork import plotly.express as px import plotly.graph_objects as go +import geopandas as gpd +import networkx as nx +import os def get_world_map_collaboration(df, edges_min=1, edgesize=5): # Estrai metadati dai paesi (assumi che tu abbia già AU_CO processato) - M = df - df = metaTagExtraction(df, "AU_CO") - df = df.get() - + df = df.get() if hasattr(df, 'get') and not isinstance(df, pd.DataFrame) else df.copy() + if "AU_CO" not in df.columns: + df = metaTagExtraction(df, "AU_CO") + M = df.copy() + # Normalizza e conta le occorrenze dei paesi (come in get_countries_production) df["AU_CO"] = df["AU_CO"].apply(lambda x: x if isinstance(x, list) else [x]) df = df.explode("AU_CO") @@ -32,6 +41,8 @@ def clean_country_names(country): # Costruisci matrice di collaborazione net = biblionetwork(M, analysis="collaboration", network="countries") + if net is None or net.empty: + return go.FigureWidget(go.Figure()), pd.DataFrame(columns=['From', 'To', 'count']) net_df = pd.DataFrame(net) # Costruisci rete diff --git a/pubmed_test.csv b/pubmed_test.csv new file mode 100644 index 000000000..adcafa3cf --- /dev/null +++ b/pubmed_test.csv @@ -0,0 +1,21 @@ +TI,AB,PY,SO,JI,VL,IS,BP,EP,DI,UT,PMID,DT,LA,TC,AU,AF,C1,CR,DE,ID,RP,DB,SR +SIFA: A two-stage adaptive ensemble framework for solar irradiance forecasting,Accurate solar irradiance forecasting is increasingly crucial for managing solar,2026,Scientific reports,Sci Rep,16,1,,,17998 [pii];10.1038/s41598-026-53183-2 [doi],42271158,42271158,Journal Article,eng,0,"['Abdel-Basset M', 'Mohamed R', 'Alrashdi I', 'Mahdi M']","['Abdel-Basset, Mohamed', 'Mohamed, Reda', 'Alrashdi, Ibrahim', 'Mahdi, Mahmoud']","['Faculty of Computers and Informatics, Zagazig University, Zagazig, 44519,', 'Faculty of Computers and Informatics, Zagazig University, Zagazig, 
44519,', 'Department of Computer Science, College of Computer and Information Sciences,', 'Faculty of Computers and Informatics, Zagazig University, Zagazig, 44519,']",[],[],[],,PUBMED,"Abdel-Basset M, 2026, Scientific reports" +Stabilizing-sensing synergistic geogrid for high-speed railways.,High-speed railway systems demand robust substructure performance and real-time,2026,Nature communications,Nat Commun,,,,,10.1038/s41467-026-74260-0 [doi],42271141,42271141,Journal Article,eng,0,"['Fu H', 'Zhang H', 'Xiao L', 'Zhan W', 'Jiang J', 'Chen Y', 'Bian X']","['Fu, Haoran', 'Zhang, Haoyu', 'Xiao, Liuyang', 'Zhan, Wenhao', 'Jiang, Jianqun', 'Chen, Yunmin', 'Bian, Xuecheng']","['Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang', 'Department of Civil Engineering, Zhejiang University, Hangzhou, China.', 'MOE Key Laboratory of Soft Soils and Geoenvironmental Engineering, Zhejiang']",[],[],[],,PUBMED,"Fu H, 2026, Nature communications" +Morphologically tunable mycelium chips for physical reservoir computing.,We introduce a neuromorphic computing substrate based on PEDOT:PSS-infused,2026,Scientific reports,Sci 
Rep,,,,,10.1038/s41598-026-55550-5 [doi],42271130,42271130,Journal Article,eng,0,"['Telhan O', 'Winiski J', 'Schaak D', 'Siegel M', 'Petrillo N', 'Bayer E']","['Telhan, Orkan', 'Winiski, Jake', 'Schaak, Damen', 'Siegel, Michael', 'Petrillo, Neale', 'Bayer, Eben']","['Ecovative LLC, Green Island, NY, 12183, USA. orkan@design.bio.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.', 'Ecovative LLC, Green Island, NY, 12183, USA.']",[],[],[],,PUBMED,"Telhan O, 2026, Scientific reports" +Transforming hemodialysis care: a tripartite collaboration model among medical,Hemodialysis demand is rising as populations age and the chronic kidney disease,2026,Clinical and experimental nephrology,Clin Exp Nephrol,,,,,10.1007/s10157-026-02903-z [doi],42271122,42271122,Journal Article;Review,eng,0,"['Noda R', 'Sakurada T', 'Ichikawa D', 'Shibagaki Y']","['Noda, Ryunosuke', 'Sakurada, Tsutomu', 'Ichikawa, Daisuke', 'Shibagaki, Yugo']","['Division of Nephrology and Hypertension, Department of Internal Medicine, St.', 'Division of Nephrology and Hypertension, Department of Internal Medicine, St.', 'Division of Nephrology and Hypertension, Department of Internal Medicine, St.', 'Division of Nephrology and Hypertension, Department of Internal Medicine, St.']",[],[],[],,PUBMED,"Noda R, 2026, Clinical and experimental nephrology" +Deep Learning-Based Metal Artifact Reduction in Cardiac Computed Tomography: A,Idiopathic ventricular fibrillation (IVF) affects 5-10% of out-of-hospital,2026,Journal of imaging informatics in medicine,J Imaging Inform Med,,,,,10.1007/s10278-026-02030-x [doi],42271106,42271106,Journal Article,eng,0,"['Benigni N', 'Lo Iacono F', 'Verheul LM', 'Guglielmo M', 'Volders P', 'Ter Bekke R', 'Pontone G', 'Hassink RJ', 'Corino VDA']","['Benigni, Nicholas', 'Lo Iacono, Francesca', 'Verheul, Lisa M', 'Guglielmo, Marco', 'Volders, Paul', 
'Ter Bekke, Rachel', 'Pontone, Gianluca', 'Hassink, Rutger J', 'Corino, Valentina D A']","['CardioTechLab, Centro Cardiologico Monzino IRCCS, Milan, Italy.', 'Department of Electronics, Information and Bioengineering, Politecnico Di Milano', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, The', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, The', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, The', 'Department of Cardiology, University Medical Center Utrecht, Utrecht, The', 'Department of Biomedical, Surgical and Dental Sciences, University of Milan,', 'Perioperative Cardiology and Cardiovascular Imaging Department, Centro', 'Department of Cardiology, Cardiovascular Research Institute Maastricht (CARIM),', 'CardioTechLab, Centro Cardiologico Monzino IRCCS, Milan, Italy.', 'Department of Electronics, Information and Bioengineering, Politecnico Di Milano']",[],[],[],,PUBMED,"Benigni N, 2026, Journal of imaging informatics in medicine" +Spatiotemporal trends of foot and mouth disease (FMD) in Bangladesh from 2017 to,Foot-and-mouth disease (FMD) is a highly contagious transboundary viral disease,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-57440-2 [doi],42271042,42271042,Journal Article,eng,0,"['Ahmed MJ', 'Alam KE', 'Talukder F', 'Bhandari P', 'Bhuiyan MIH', 'Mamun M', 'Rahman MA', 'Chalise R', 'Hossain MI', 'Morshed M', 'Sabrin MS', 'Chowdhury MTI', 'Alam MJ', 'Hossain D']","['Ahmed, Md Jisan', 'Alam, Kazi Estieque', 'Talukder, Faisol', 'Bhandari, Prajwal', 'Bhuiyan, Md Ismile Hossain', 'Mamun, Mustakim', 'Rahman, Md Arifur', 'Chalise, Ritu', 'Hossain, Md Imran', 'Morshed, Moheuddin', 'Sabrin, Mirza Synthia', 'Chowdhury, Md Tazul Islam', 'Alam, Md Jahangir', 'Hossain, Delower']","['Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Pathology, Faculty of Animal Science and Veterinary Medicine,', 'Association of Coding, Technology, and Genomics (ACTG), 
Sher-e-Bangla', 'Department of Livestock Service (DLS), Dhaka, 1212, Bangladesh.', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Microbiology and Parasitology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Animal Production and Management, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Animal Production and Management, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Surgery and Theriogenology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Microbiology and Parasitology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Pathology, Faculty of Animal Science and Veterinary Medicine,', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Microbiology and Parasitology, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Agricultural Chemistry, Faculty of Agriculture, Sher-e-Bangla', 'Department of Animal Production and Management, Faculty of Animal Science and', 'Association of Coding, Technology, and Genomics (ACTG), Sher-e-Bangla', 'Department of Medicine and Public Health, Faculty of Animal Science and']",[],[],[],,PUBMED,"Ahmed MJ, 2026, Scientific reports" +Machine learning-assisted design of a wideband Fe-SiO(2)-MXene metamaterial solar,The manuscript proposed an efficient broadband metamaterial-inspired multilayered,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-57447-9 [doi],42271013,42271013,Journal Article,eng,0,"['Lavadiya S', 'Sorathiya V', 'Jaffar AY', 'Alsayegh AB', 'Khayyat 
KMJ']","['Lavadiya, Sunil', 'Sorathiya, Vishal', 'Jaffar, Amar Y', 'Alsayegh, Abdulghani Bakur', 'Khayyat, Khalid M Jamil']","['Department of Information and Communication Technology, Marwadi University,', 'Faculty of Engineering and Technology, Parul Institute of Engineering and', 'Computer and Network Engineering Department, College of Computing, Umm Al-Qura', 'Computer and Network Engineering Department, College of Computing, Umm Al-Qura', 'Computer and Network Engineering Department, College of Computing, Umm Al-Qura']",[],[],[],,PUBMED,"Lavadiya S, 2026, Scientific reports" +Human migration has surged since 2000 - these maps reveal where people are going.,,2026,Nature,Nature,,,,,10.1038/d41586-026-01796-y [doi],42271002,42271002,News,eng,0,['Naddaf M'],"['Naddaf, Miryam']",[],[],[],[],,PUBMED,"Naddaf M, 2026, Nature" +People are turning to AI chatbots to plug gaps in health information.,,2026,Nature,Nature,,,,,10.1038/d41586-026-01737-9 [doi],42270995,42270995,News,eng,0,['Gerstung M'],"['Gerstung, Moritz']",[],[],[],[],,PUBMED,"Gerstung M, 2026, Nature" +How I use AI to turn failed drugs into new medicines.,,2026,Nature,Nature,,,,,10.1038/d41586-026-01626-1 [doi],42270981,42270981,News,eng,0,['Ulker E'],"['Ulker, Emma']",[],[],[],[],,PUBMED,"Ulker E, 2026, Nature" +An active learning workflow for predicting misfit volume in body-centered cubic,Refractory high-entropy alloys (RHEAs) exhibit exceptional high-temperature,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-57006-2 [doi],42270978,42270978,Journal Article,eng,0,"['Liu S', 'Balachandran PV']","['Liu, Shunshun', 'Balachandran, Prasanna V']","['University of Virginia, Department of Materials Science and Engineering,', 'University of Virginia, Department of Materials Science and Engineering,']",[],[],[],,PUBMED,"Liu S, 2026, Scientific reports" +SERPINA3 and NDRG1 are critical diagnostic immune genes associated with,Preeclampsia (PE) is a pregnancy complication involving immune 
dysregulation.,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56930-7 [doi],42270919,42270919,Journal Article,eng,0,"['Wu Z', 'Chen S', 'Chen W', 'Xie Y', 'Zhou Z', 'Huang L', 'Wang Y', 'Chen B', 'Yang C', 'Ke Y']","['Wu, Zhuna', 'Chen, Shihong', 'Chen, Weihong', 'Xie, Yajing', 'Zhou, Zhimei', 'Huang, Li', 'Wang, Yueli', 'Chen, Binbin', 'Yang, Congmei', 'Ke, Yumin']","['Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian', 'Department of Gynecology and Obstetrics, The Second Affiliated Hospital of Fujian']",[],[],[],,PUBMED,"Wu Z, 2026, Scientific reports" +Hetairos is a histology-based artificial intelligence model for predicting,Molecular testing is essential for classifying central nervous system (CNS),2026,Nature cancer,Nat Cancer,,,,,10.1038/s43018-026-01186-3 [doi],42270902,42270902,Journal Article,eng,0,"['Jin D', 'Shmatko A', 'Patel A', 'Rutz S', 'Friedrich L', 'Banan R', 'Rahmanzade R', 'Sievers P', 'Hamelmann S', 'Schrimpf D', 'Gobel K', 'Bogumil H', 'Maas SLN', 'Sill M', 'Hinz FE', 'Suwala AK', 'Keller F', 'Habel A', 'Rukhovich G', 'Zettl F', 'Alhalabi OT', 'Ille S', 'Sehring J', 'Amsel D', 'Wiestler B', 'Piovesan Lago P', 'Suchorska B', 'Ahmad O', 'Sturm D', 'Reuss D', 'Wesseling P', 'Wohrer A', 'Heppner FL', 'Blumcke I', 'Delbridge C', 'Jakobs M', 
'Herold-Mende C', 'Krieg SM', 'Wick W', 'Jones DTW', 'Pfister SM', 'Al-Hussaini M', 'Hou Y', ""D'Almeida Costa F"", 'Schweizer L', 'Bertero L', 'Acker T', 'Tauziede-Espariat A', 'Varlet P', 'Merkler D', 'Egervari K', 'Dohmen H', 'Zoroquiain P', 'Gejman R', 'Brandner S', 'Bai X', 'von Deimling A', 'Sahm F', 'Gerstung M']","['Jin, Darui', 'Shmatko, Artem', 'Patel, Areeba', 'Rutz, Samuel', 'Friedrich, Lukas', 'Banan, Rouzbeh', 'Rahmanzade, Ramin', 'Sievers, Philipp', 'Hamelmann, Stefan', 'Schrimpf, Daniel', 'Gobel, Kirsten', 'Bogumil, Henri', 'Maas, Sybren L N', 'Sill, Martin', 'Hinz, Felix E', 'Suwala, Abigail K', 'Keller, Felix', 'Habel, Antje', 'Rukhovich, Gleb', 'Zettl, Ferdinand', 'Alhalabi, Obada T', 'Ille, Sebastian', 'Sehring, Jannik', 'Amsel, Daniel', 'Wiestler, Benedikt', 'Piovesan Lago, Pedro', 'Suchorska, Bogdana', 'Ahmad, Olfat', 'Sturm, Dominik', 'Reuss, David', 'Wesseling, Pieter', 'Wohrer, Adelheid', 'Heppner, Frank L', 'Blumcke, Ingmar', 'Delbridge, Claire', 'Jakobs, Martin', 'Herold-Mende, Christel', 'Krieg, Sandro M', 'Wick, Wolfgang', 'Jones, David T W', 'Pfister, Stefan M', 'Al-Hussaini, Maysa', 'Hou, Yanghao', ""D'Almeida Costa, Felipe"", 'Schweizer, Leonille', 'Bertero, Luca', 'Acker, Till', 'Tauziede-Espariat, Arnault', 'Varlet, Pascale', 'Merkler, Doron', 'Egervari, Kristof', 'Dohmen, Hildegard', 'Zoroquiain, Pablo', 'Gejman, Roger', 'Brandner, Sebastian', 'Bai, Xiangzhi', 'von Deimling, Andreas', 'Sahm, Felix', 'Gerstung, Moritz']","['Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Image Processing Center, Beihang University, Beijing, China.', 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Faculty of Biosciences, Heidelberg University, Heidelberg, Germany.', 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer 
Consortium (DKTK) and', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Faculty of Mathematics and Computer Science, Heidelberg University, Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Pathology, Leiden University Medical Center, Leiden, The', 'Department of Pathology, Brain Tumor Center, Erasmus MC Cancer Institute,', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Neurooncology, German Cancer Research Center (DKFZ) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of 
Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Faculty of Mathematics and Computer Science, Heidelberg University, Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'AI for Image-Guided Diagnosis and Therapy, School of Medicine and Health,', 'Munich Center for Machine Learning (MCML), Munich, Germany.', 'AC Camargo Cancer Center, Sao Paulo, Brazil.', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Glioma Research, German Cancer Research Center (DKFZ),', 'Department of Pediatric Hematology and Oncology, University Hospital Heidelberg,', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Princess Maxima Center for Pediatric Oncology, Utrecht, The Netherlands.', 'Department of Pathology, Amsterdam University Medical Centers/VUmc, Amsterdam,', 
'Division of Neuropathology and Neurochemistry, Department of Neurology,', 'Institute of Neuropathology and Neuromolecular Pathology, Medical University of', 'Department of Neuropathology, Charite-Universitatsmedizin Berlin, corporate', 'German Center for Neurodegenerative Diseases (DZNE) within the Helmholtz', 'Department of Neuropathology, University Hospital Erlangen, Friedrich-Alexander', 'Institute of Pathology, School of Medicine and Health, Technical University of', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Division for Stereotactic Neurosurgery, Department of Neurosurgery, University', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Department of Neurosurgery, University Hospital Heidelberg, Heidelberg, Germany.', 'Neurology Clinic, University Hospital Heidelberg, Heidelberg, Germany.', 'Clinical Cooperation Unit Neurooncology, German Cancer Consortium (DKTK) and', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Glioma Research, German Cancer Research Center (DKFZ),', 'National Center for Tumor Diseases (NCT), Heidelberg, Germany.', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of Pediatric Neurooncology, German Cancer Research Center (DKFZ) and', 'Department of Pediatric Hematology and Oncology, University Hospital Heidelberg,', 'National Center for Tumor Diseases (NCT), Heidelberg, Germany.', 'Department of Cell Therapy and Applied Genomics, King Hussein Cancer Center,', 'Department of Pathology and Laboratory Medicine, King Hussein Cancer Center,', 'Department of Pathology, Center for Molecular Medicine Testing, College of Basic', 'Center for Medical Epigenetics, School of Basic Medical Sciences, Chongqing', 'AC Camargo Cancer Center, Sao Paulo, Brazil.', 'DASA Laboratories, Sao Paulo, Brazil.', 'Edinger Institute, Institute of Neurology, University of Frankfurt am Main,', 'German Cancer Consortium (DKTK) 
Partner Site Frankfurt/Mainz and German Cancer', 'Frankfurt Cancer Institute (FCI), Frankfurt am Main, Germany.', 'Department of Medical Sciences, University of Turin, Turin, Italy.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'Department of Neuropathology, Sainte-Anne Hospital, Paris, France.', 'Inserm, UMR 1266, IMA-Brain, Institut de Psychiatrie et Neurosciences de Paris,', 'Department of Neuropathology, Sainte-Anne Hospital, Paris, France.', 'Inserm, UMR 1266, IMA-Brain, Institut de Psychiatrie et Neurosciences de Paris,', 'Department of Pathology and Immunology, University of Geneva, Geneva,', 'Division of Clinical Pathology, Geneva University Hospital, Geneva, Switzerland.', 'Department of Pathology and Immunology, University of Geneva, Geneva,', 'Division of Clinical Pathology, Geneva University Hospital, Geneva, Switzerland.', 'Institute of Neuropathology, Justus Liebig University Giessen, Giessen, Germany.', 'Pathology Department, Faculty of Medicine, Pontificia Universidad Catolica de', 'Pathology Department, Faculty of Medicine, Pontificia Universidad Catolica de', 'Department of Neurodegenerative Disease, UCL Queen Square Institute of Neurology,', 'Image Processing Center, Beihang University, Beijing, China.', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', 'Department of Neuropathology, University Hospital Heidelberg, Heidelberg,', 'Clinical Cooperation Unit Neuropathology, German Cancer Consortium (DKTK) and', ""Hopp Children's Cancer Center (KiTZ), Heidelberg, Germany."", 'Division of AI in Oncology, German Cancer Research Center (DKFZ), Heidelberg,', 'Faculty of Mathematics and Computer Science, Heidelberg University, Heidelberg,']",[],[],[],,PUBMED,"Jin D, 2026, Nature cancer" +StyleGAN-based synthetic image augmentation for multi-class otoscopy image,Accurate diagnosis of eardrum abnormalities is 
pivotal for effectively managing,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56954-z [doi],42270897,42270897,Journal Article,eng,0,"['Camalan S', 'Langefeld CD', 'Zinnia A', 'Moberly AC', 'Gurcan MN']","['Camalan, Seda', 'Langefeld, Carl D', 'Zinnia, Amy', 'Moberly, Aaron C', 'Gurcan, Metin N']","['Center for Artificial Intelligence Research, Wake Forest University School of', 'Center for Artificial Intelligence Research, Wake Forest University School of', 'Biostatistics and Data Science, Wake Forest University School of Medicine,', 'Public Health Sciences, Wake Forest University School of Medicine, Winston-Salem,', 'Biostatistics and Data Science, Wake Forest University School of Medicine,', 'Dept. of Otolaryngology-Head and Neck Surgery, Vanderbilt University Medical', 'Center for Artificial Intelligence Research, Wake Forest University School of']",[],[],[],,PUBMED,"Camalan S, 2026, Scientific reports" +Spatiotemporal evolution and driving factors of water conservation capacity in,Water conservation services serve as a pivotal ecosystem service for water,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56962-z [doi],42270863,42270863,Journal Article,eng,0,"['Hou H', 'Guo F', 'Wang P', 'Lu D', 'Chen C', 'Bai J', 'Li H', 'Bao Z', 'Qin M', 'Liu Y', 'Fan X']","['Hou, Huimin', 'Guo, Feng', 'Wang, Pengquan', 'Lu, Di', 'Chen, Changjie', 'Bai, Junxing', 'Li, Haohao', 'Bao, Zhiqiang', 'Qin, Mingyang', 'Liu, Yufei', 'Fan, Xinjian']","['Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Qinghai Minzu University, Xining, 810007, Qinghai Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, 
China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.', 'Lanzhou University of Technology, Lanzhou, 730050, Gansu Province, China.']",[],[],[],,PUBMED,"Hou H, 2026, Scientific reports" +Detecting application layer DDoS attack using an advanced signature detection,Application-layer Distributed Denial of Service (App-DDoS) attacks are an ongoing,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56617-z [doi],42270859,42270859,Journal Article,eng,0,"['Jaafar AG', 'Ngadi MA', 'Kama N', 'Kamarudin NH', 'Shapawi K']","['Jaafar, Abdul Ghafar', 'Ngadi, Md Asri', 'Kama, Nazri', 'Kamarudin, Nazhatul Hafizah', 'Shapawi, Khairol']","['Faculty of Artificial Intelligence, Universiti Teknologi Malaysia, Kuala Lumpur,', 'Faculty of Computing, Universiti Teknologi Malaysia, Johor Bahru, Johor,', 'Faculty of Artificial Intelligence, Universiti Teknologi Malaysia, Kuala Lumpur,', 'Centre for Cyber Security, Faculty of Information Science and Technology,', 'KYROL Security Labs Sdn. 
Bhd, Cyberjaya, Malaysia.']",[],[],[],,PUBMED,"Jaafar AG, 2026, Scientific reports" +An IBGWO optimized feature selection framework for sentiment analysis-based,Detecting fraudulent websites is critical to ensuring network security and,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56748-3 [doi],42270851,42270851,Journal Article,eng,0,"['Perumal S', 'Vishwanathan AJ']","['Perumal, Saraswathi', 'Vishwanathan, Anchitaalagammai Jayalakshmi']","['Department of Information Technology, Velammal College of Engineering and', 'Department of Computer Science and Engineering (Cyber Security), Velammal College']",[],[],[],,PUBMED,"Perumal S, 2026, Scientific reports" +Bulk and single-cell transcriptomics reveal prognostic signatures of,Lung adenocarcinoma (LUAD) is one of the most severe malignant tumors.,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56684-2 [doi],42270842,42270842,Journal Article,eng,0,"['Zhou L', 'Lei P', 'Luo Z', 'Xiao J', 'Chen Z']","['Zhou, Lihua', 'Lei, Peng', 'Luo, Zhouguang', 'Xiao, Jie', 'Chen, Zongyu']","['Department of Pulmonary and Critical Care Medicine, Affiliated Hospital of', 'Department of Neurosurgery, Affiliated Hospital of Guizhou Medical University,', ""Department of Infectious Disease, Longgang People's Hospital (The Longgang Branch"", 'Department of Emergency, Affiliated Hospital of Guizhou Medical University, No.', 'Department of Pulmonary and Critical Care Medicine, Affiliated Hospital of']",[],[],[],,PUBMED,"Zhou L, 2026, Scientific reports" +Machine learning-enabled ECG arrhythmia classification: a systematic and,Electrocardiogram (ECG) signals play a critical role in the early detection of,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56828-4 [doi],42270808,42270808,Journal Article,eng,0,['Melek N'],"['Melek, Negin']","['Faculty of Engineering and Natural Sciences, Gumushane University, Gumushane,']",[],[],[],,PUBMED,"Melek N, 2026, Scientific reports" +Integrative single-cell and spatial transcriptomics 
with machine learning,Triple-negative breast cancer is marked by extensive cellular heterogeneity and,2026,Scientific reports,Sci Rep,,,,,10.1038/s41598-026-56434-4 [doi],42270806,42270806,Journal Article,eng,0,"['Wu J', 'Fan J', 'Sha T', 'Li H']","['Wu, Jinpeng', 'Fan, Jingjing', 'Sha, Tong', 'Li, Hongtao']","['Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang', 'Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang', 'Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang', 'Department of Breast and Thyroid Surgery, Affiliated Tumor Hospital of Xinjiang']",[],[],[],,PUBMED,"Wu J, 2026, Scientific reports" diff --git a/www/services/__init__.py b/www/services/__init__.py index 28584e105..9a16e7bea 100644 --- a/www/services/__init__.py +++ b/www/services/__init__.py @@ -1,17 +1,2 @@ -from .biblionetwork import * -from .cocmatrix import * -from .couplingmap import * -from .format_functions import * -from .histnetwork import * -from .histplot import * -from .htmldownload import * -from .igraph2vis import * -from .metatagextraction import * -from .networkplot import * -from .parsers import * -from .plotlydownload import * -from .savereport import * -from .tabletag import * -from .termextraction import * -from .thematicmap import * -from .utils import * \ No newline at end of file +# Selective imports to avoid loading heavy dependencies automatically. +# Individual modules can still be imported directly when needed. 
\ No newline at end of file diff --git a/www/services/biblionetwork.py b/www/services/biblionetwork.py index 7e65b4880..8e0288300 100644 --- a/www/services/biblionetwork.py +++ b/www/services/biblionetwork.py @@ -71,11 +71,11 @@ def crossprod(A, B): filtered_index = [idx for idx in NetMatrix.index if str(idx).strip()] NetMatrix = NetMatrix.loc[filtered_index, filtered_columns] - M = M.get() # Estrai il dizionario se M è un oggetto + M = M.get() if hasattr(M, 'get') and callable(M.get) and not isinstance(M, pd.DataFrame) else M.copy() # Estrai il dizionario se M è un oggetto db_name = M["DB"].iloc[0] print(f"db_name: {db_name}") - if network == "references" and db_name == "SCOPUS": + if network == "references" and db_name.upper() == "SCOPUS": ind = [i for i, col in enumerate(NetMatrix.columns) if str(col)[0].isalpha()] NetMatrix = NetMatrix.iloc[ind, ind] diff --git a/www/services/cocmatrix.py b/www/services/cocmatrix.py index f523aed67..0da83cb2f 100644 --- a/www/services/cocmatrix.py +++ b/www/services/cocmatrix.py @@ -19,7 +19,7 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short Returns: A bipartite network matrix with cases corresponding to manuscripts and variables to the objects extracted from the Tag Field. 
""" - M = df.get() + M = df.copy() if "LABEL" not in M.columns: M.index = M["SR"] @@ -28,7 +28,7 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short # REMOVE TERMS AND MERGE SYNONYMS if Field in ["ID", "DE", "TI", "TI_TM", "AB", "AB_TM"]: - Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)]) + Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in str(x).split(sep)]) TERMS = pd.DataFrame({"item": [item.upper() for sublist in Fi for item in sublist], "SR": M.index.repeat(Fi.str.len())}) # Merge synonyms @@ -49,7 +49,7 @@ def cocMatrix(df, Field="AU", type="sparse", n=None, sep=";", binary=True, short M["CR"] = M["CR"].apply(lambda x: [ref.replace("DOI;", "DOI ") for ref in x] if isinstance(x, list) else x) if Field in M.columns: - Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in x.split(sep)]) + Fi = M[Field].fillna("").apply(lambda x: x if isinstance(x, list) else [i.strip() for i in str(x).split(sep)]) else: print(f"Field {Field} is not a column name of input data frame") return diff --git a/www/services/couplingmap.py b/www/services/couplingmap.py index a2b3628d7..f9e180001 100644 --- a/www/services/couplingmap.py +++ b/www/services/couplingmap.py @@ -16,7 +16,7 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, return None df = metaTagExtraction(df, "SR") # serve questo per avere il merging perfetto per uniformare la colonna SR - M = df.get() + M = df.copy() ngrams = int(ngrams) minfreq = max(0, int(minfreq * len(M) // 1000)) @@ -79,22 +79,23 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, DC = DC.reset_index(drop=True) # Raggruppa senza ambiguità - df_lab = DC.groupby('group', as_index=False).apply(lambda x: x.assign( - MNLCS2=x['MNLCS'].where(x['MNLCS'] >= 1), - MNLCS=round(x['MNLCS'], 2), - name=x['name'].str.lower(), - freq=len(x) - 
)).sort_values(by=['MNLCS'], ascending=False) + df_lab = DC.copy() + df_lab['MNLCS2'] = df_lab['MNLCS'].where(df_lab['MNLCS'] >= 1) + df_lab['MNLCS'] = df_lab.groupby('group')['MNLCS'].transform(lambda x: round(x, 2)) + df_lab['name'] = df_lab['name'].str.lower() + df_lab['freq'] = df_lab.groupby('group')['name'].transform('count') + df_lab = df_lab.sort_values(by=['MNLCS'], ascending=False) df = df_lab.groupby('group').apply(lambda x: pd.Series({ 'freq': x['freq'].iloc[0], 'centrality': x['pagerank_centrality'].mean() * 100, 'impact': np.nan_to_num(x['MNLCS2'].mean(skipna=True)), - 'label_cluster': x['group'].iloc[0], + 'label_cluster': x.name, 'color': x['color'].iloc[0], 'label': '\n'.join(x['name'].iloc[:min(n_labels, len(x))].tolist()), 'words': '\n'.join((x['name'] + ' ' + x['MNLCS'].astype(str)).tolist()) })).reset_index() + df['rcentrality'] = df['centrality'].rank() df['rimpact'] = df['impact'].rank() @@ -104,12 +105,12 @@ def couplingMap(df, analysis="documents", field="CR", n=500, minfreq=5, df = df[df['freq'] >= minfreq] df_lab = df_lab[df_lab['group'].isin(df['group'])] - df_lab = df_lab.iloc[:, [0, 6, 14, 7, 3]] + df_lab = df_lab[[analysis, 'cluster', 'freq', 'color', 'MNLCS']] df_lab.columns = [analysis, "Cluster", "ClusterFrequency", "ClusterColor", "NormalizedLocalCitationScore"] df_lab['ClusterName'] = df_lab['Cluster'].map(df.set_index('group')['label']) - M = M.drop(columns=['SR']).reset_index() + M = M.reset_index() if label_term is None: label_term = "null" @@ -314,7 +315,12 @@ def normalizeCitationScore(df, field="documents", impact_measure="local"): # Applica localCitations se richiesto if impact_measure == "local": - df = localCitations(df, fast_search=False, sep=";")['M'] + try: + df = localCitations(df, fast_search=False, sep=";")['M'] + except Exception as e: + import traceback + traceback.print_exc() + raise else: df['LCS'] = 0 @@ -435,8 +441,8 @@ def network(df, analysis, field, stemming, n, cluster, community_repulsion): def 
labeling(df, df_lab, term, n, n_labels, analysis, ngrams): # Se il termine è TI o AB, estrai termini if term in ["TI", "AB"]: - df = term_extraction(reactive.Value(df), field=term, ngrams=ngrams, verbose=False) - df = df.get() + df = term_extraction(df, field=term, ngrams=ngrams, verbose=False) + df = df.copy() term = f"{term}_TM" # Normalizzazione delle stringhe per evitare errori di merge @@ -517,7 +523,7 @@ def best_lab(df, tab_global, n_labels, term): def localCitations(df, fast_search=False, sep=";"): df = metaTagExtraction(df, "SR") - M = df.get() + M = df.copy() M['TC'] = M['TC'].fillna(0) if fast_search: loccit = M['TC'].quantile(0.75) @@ -525,6 +531,14 @@ def localCitations(df, fast_search=False, sep=";"): loccit = 1 H = histNetwork(df, min_citations=loccit, sep=sep, network=False) + if H is None: + M['LCS'] = 0 + CR = { + 'Authors': pd.DataFrame(columns=["Authors", "N. of Local Citations"]), + 'Papers': pd.DataFrame(columns=["Paper", "DOI", "Year", "LCS", "GCS"]), + 'M': M + } + return CR LCS = H['histData'] M = H['M'] diff --git a/www/services/etl/__init__.py b/www/services/etl/__init__.py new file mode 100644 index 000000000..5f9941761 --- /dev/null +++ b/www/services/etl/__init__.py @@ -0,0 +1,4 @@ +from .standardizer import convert2df +from .transformer import transform +from .validator import validate +from .api_retriever import retrieve_openalex, retrieve_pubmed diff --git a/www/services/etl/api_retriever.py b/www/services/etl/api_retriever.py new file mode 100644 index 000000000..3ae35586b --- /dev/null +++ b/www/services/etl/api_retriever.py @@ -0,0 +1,281 @@ +""" +API Retriever module for the Bibliometrix ETL pipeline. +Retrieves bibliographic data from PubMed and OpenAlex REST APIs. +Handles pagination, rate limits, and retrieves automatically. 
+""" +import requests +import time +import pandas as pd + +OPENALEX_BASE_URL = "https://api.openalex.org/works" +PUBMED_SEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" +PUBMED_FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" + +MAX_RETRIES = 3 +RETRY_DELAY = 2 +PAGE_SIZE = 25 + +def _get_with_retry(url:str, params:dict) -> dict: + """ + Perform a GET request with automatic retry on failure. + Waits RETRY_DELAY seconds between attempts + Args: + url:The endpoint URL to call. + params:Query parameters to include in the request. + Returns: + The parsed JSON response as a dictionary. + Raises: + RuntimeError: If all retry attempts fail. + """ + for attempt in range(1,MAX_RETRIES+1): + try: + response = requests.get(url, params=params, timeout=10) + response.raise_for_status() + return response.json() + except requests.RequestException as e: + print(f"[API] Attempt {attempt}/{MAX_RETRIES} failed: {e}") + if attempt < MAX_RETRIES: + time.sleep(RETRY_DELAY) + raise RuntimeError(f"All {MAX_RETRIES} attempts failed for URL: {url}") + +def _parse_openalex_record(work:dict) -> dict: + """ + Parse a single OpenAlex work record into a flat dictionary + using the standard WoS-compatible field names. + Args: + work:A single work object from the OpenAlex API response. + Returns: + A flat dictionary with standardized field names. 
+ """ + authors = [] + authors_full = [] + affiliations = [] + for autorship in work.get("autorship",[]): + author_name = autorship.get("author",{}).get("display_name","") + authors.append(author_name) + authors_full.append(author_name) + for inst in autorship.get("institutions",[]): + affiliations.append(inst.get("display_name","")) + abstract = "" + inverted_index = work.get("abstract_inverted_index",{}) + if inverted_index: + words = [""] * (max( + pos for psitions in inverted_index.values() + for pos in psitions + )+1) + for word, positions in inverted_index.items(): + for pos in positions: + words[pos] = word + abstract = " ".join(words) + keywords = [kw.get("display_name","") for kw in work.get("keywords",[])] + cited_refs = work.get("referenced_works", []) + source = work.get("primary_location",{}) or {} + source_info = source.get("source",{}) or {} + journal = source.get("display_name","") + journal_abbr = source.get("issn_1","") + biblio = work.get("biblio",{}) or {} + doi = work.get("doi","") or "" + doi = doi.replace("https://doi.org/","") + return { + "TI":work.get("title",""), + "AB":abstract, + "PY":str(work.get("publication_year","")), + "SO":journal, + "JI":journal_abbr, + "VL":str(biblio.get("volume","") or ""), + "IS":str(biblio.get("issue","") or ""), + "BP":str(biblio.get("first_page","") or ""), + "EP":str(biblio.get("last_page","") or ""), + "DI":doi, + "UT":work.get("id",""), + "PMID":str(work.get("ids",{}).get("pmid","") or "").replace("https://pubmed.ncbi.nlm.nih.gov/",""), + "DT":work.get("type",""), + "LA":work.get("language",""), + "TC":work.get("cited_by_count",0), + "AU":authors, + "AF":authors_full, + "C1":affiliations, + "CR":cited_refs, + "DE":keywords, + "ID":keywords, + "RP":"", + "DB":"OPENALEX" + } + +def retrieve_openalex(query:str, max_results:int = 100) -> pd.DataFrame: + """ + Retrieve bibliographic records form OpenAlex for a given query. + Handles pagination automatically. + Args: + query:The search query string (e.g. 
'machine learning'). + max_results: Maximum number of records to retrieve. + Returns: + A DataFrame with no row per record, using standard WoS field names. + """ + print(f"[OpenAlex] Searching for: '{query}' (max {max_results} results)") + records = [] + page = 1 + while len(records) < max_results: + params = { + "search":query, + "per-page":PAGE_SIZE, + "page":page, + "select":"id,title,abstract_inverted_index,publication_year,primary_location,biblio,doi,ids,type,language,cited_by_count,authorships,keywords,referenced_works", + } + data = _get_with_retry(OPENALEX_BASE_URL, params) + works = data.get("results",[]) + if not works: + print("[OpenAlex] No more results.") + break + for work in works: + records.append(_parse_openalex_record(work)) + if len(records) >= max_results: + break + print(f"[OpenAlex] Page {page}: retrieved {len(works)} records. Total so far: {len(records)}") + page += 1 + time.sleep(0.5) + print(f"[OpenAlex] Done. Total records retrieved: {len(records)}") + return pd.DataFrame(records) + +def _fetch_pubmed_ids(query:str, max_results:int) -> list: + """ + Search PubMed for a query and return a list of PubMed IDs. + Args: + query:The search query string. + max_results:Maximum number of IDs to retrieve. + Returns: + A list of PubMed ID strings. + """ + params = { + "db":"pubmed", + "term":query, + "retmax":max_results, + "retmode":"json" + } + data = _get_with_retry(PUBMED_SEARCH_URL,params) + ids = data.get("esearchresult", {}).get("idlist",[]) + print(f"[PubMed] Found {len(ids)} IDs.") + return ids + +def _fetch_pubmed_records(pmids:list) -> list: + """ + Fetch full records for a list of PubMed IDs in batches. + Args: + pmids:List of PubMed ID strings. + Returns: + A list of flat dictionaries with standardized field names. 
+ """ + records = [] + batch_size = 20 + for i in range(0,len(pmids),batch_size): + batch = pmids[i:i+batch_size] + params = { + "db":"pubmed", + "id":",".join(batch), + "retmode":"text", + "rettype":"medline" + } + for attempt in range(1, MAX_RETRIES+1): + try: + response = requests.get(PUBMED_FETCH_URL, params=params, timeout=10) + response.raise_for_status() + break + except requests.RequestException as e: + print(f"[PubMed] Attempt {attempt}/{MAX_RETRIES} failed: {e}") + if attempt < MAX_RETRIES: + time.sleep(RETRY_DELAY) + else: + raise RuntimeError(f"Failed to fetch PubMed batch starting at index {i}") + records.extend(_parse_pubmed_text(response.text)) + print(f"[PubMed] Fetched batch {i//batch_size+1}.Total so far: {len(records)}") + time.sleep(0.5) + return records + +def _medline_to_standard(record:dict) -> dict: + """ + Convert a raw MEDLINE record dictionary to standard WoS field names. + Args: + record:A raw dictionary with MEDLINE field tags as keys. + Returns: + A flat dictionary with standardized WoS field names. 
+ """ + authors = [a.strip() for a in record.get("AU","").split(";") if a.strip()] + authors_full = [a.strip() for a in record.get("FAU","").split(";") if a.strip()] + dp = record.get("DP","") + import re + year_match = re.search(r"\d{4}", dp) + year = year_match.group(0) if year_match else "" + keywords = [k.strip().replace("*","") for k in record.get("MH","").split(";") if k.strip()] + return { + "TI":record.get("TI", ""), + "AB":record.get("AB", ""), + "PY":year, + "SO":record.get("JT", ""), + "JI":record.get("TA", ""), + "VL":record.get("VI", ""), + "IS":record.get("IP", ""), + "BP":record.get("PG", "").split("-")[0] if record.get("PG") else "", + "EP":record.get("PG", "").split("-")[-1] if record.get("PG") else "", + "DI":record.get("LID", ""), + "UT":record.get("PMID", ""), + "PMID":record.get("PMID", ""), + "DT":record.get("PT", ""), + "LA":record.get("LA", ""), + "TC":0, + "AU":authors, + "AF":authors_full, + "C1":[a.strip() for a in record.get("AD", "").split(";") if a.strip()], + "CR":[], + "DE":keywords, + "ID":keywords, + "RP":"", + "DB":"PUBMED" + } + +def _parse_pubmed_text(text:str) -> list: + """ + Parse PubMed MEDLINE text format into a list of flat dictionaries. + Args: + text:Raw MEDLINE text from the PubMed efetch API. + Returns: + A list of flat dictionaries with standardized field names. + """ + records = [] + current = {} + current_key = None + for line in text.splitlines(): + if line.strip() == "": + if current: + records.append(_medline_to_standard(current)) + current = {} + current_key = None + continue + if line[:4].strip() and line [4:6] == "- ": + current_key = line[:4].strip() + value = line[6:].strip() + if current_key in current: + current[current_key] += ";" + value + else: + current[current_key] = value + if current: + records.append(_medline_to_standard(current)) + return records + +def retrieve_pubmed(query:str, max_results:int=100) -> pd.DataFrame: + """ + Retrieve bibliographic records from PubMed for a given query. 
"""
Mapping dictionary for OpenAlex API response data.
Maps raw OpenAlex field names (dotted paths for nested fields) to standard
WoS field tags.
"""
OPENALEX_MAPPING = {
    "title": "TI",
    "abstract": "AB",
    "publication_year": "PY",
    "primary_location.source.display_name": "SO",
    # OpenAlex names the linking ISSN "issn_l" (letter L, not the digit one).
    "primary_location.source.issn_l": "JI",
    "biblio.volume": "VL",
    "biblio.issue": "IS",
    "biblio.first_page": "BP",
    "biblio.last_page": "EP",
    "doi": "DI",
    "id": "UT",
    "ids.pmid": "PMID",
    "type": "DT",
    "language": "LA",
    # Was misspelled "citied_by_count", which can never match the API field.
    "cited_by_count": "TC",
}
# Alternate spellings accepted for a source name, mapped to the canonical one.
# FILE_SOURCES registers Dimensions as "dimension" while extract_file and the
# documentation use "dimensions"; both spellings must reach the same loader.
_SOURCE_ALIASES = {"dimension": "dimensions"}


def _canonical_source(source) -> str:
    """Return the trimmed, lower-cased, alias-resolved name of a source."""
    key = str(source).strip().lower()
    return _SOURCE_ALIASES.get(key, key)


def extract_file(source: str, filepath: str) -> pd.DataFrame:
    """
    Extract raw data from a local file based on the source type.
    Supports CSV, XLSX, and TXT (PubMed MEDLINE format). The source name and
    the file extension are matched case-insensitively.
    Args:
        source: The source database name ('scopus', 'dimensions', 'pubmed').
        filepath: The path to the local file to load.
    Returns:
        A raw DataFrame loaded from the file.
    Raises:
        ValueError: If the source or the file type is not supported.
    """
    print(f"[EXTRACT] Loading file: {filepath} (source: {source})")
    source_key = _canonical_source(source)
    lowered_path = str(filepath).lower()

    if source_key == "scopus":
        if lowered_path.endswith(".csv"):
            # "utf-8-sig" reads plain UTF-8 too, and strips a leading BOM that
            # would otherwise end up inside the first column name.
            return pd.read_csv(filepath, encoding="utf-8-sig")
        raise ValueError(f"Scopus only supports .csv files. Got: {filepath}")

    if source_key == "dimensions":
        # The first row of the export is skipped; the header is on the second.
        if lowered_path.endswith(".xlsx"):
            return pd.read_excel(filepath, skiprows=1)
        if lowered_path.endswith(".csv"):
            return pd.read_csv(filepath, skiprows=1, encoding="utf-8-sig")
        raise ValueError(f"Dimensions only supports .csv or .xlsx files. Got: {filepath}")

    if source_key == "pubmed":
        if lowered_path.endswith(".txt"):
            from www.services.etl.api_retriever import _parse_pubmed_text
            with open(filepath, "r", encoding="utf-8-sig") as handle:
                text = handle.read()
            return pd.DataFrame(_parse_pubmed_text(text))
        raise ValueError(f"PubMed only supports .txt files. Got: {filepath}")

    supported = sorted({_canonical_source(name) for name in FILE_SOURCES})
    raise ValueError(f"Unsupported file source: '{source}'. "
                     f"Supported sources: {supported}")


def convert2df(
    source: str,
    filepath: str = None,
    query: str = None,
    max_results: int = 100,
    run_validation: bool = True,
) -> pd.DataFrame:
    """
    Main entry point of the Bibliometrix ETL pipeline.
    Converts heterogeneous bibliographic data into a standardized DataFrame.
    Equivalent to the convert2df() function in the R version of Bibliometrix.
    Can operate in two modes:
        - FILE MODE: loads a manually exported file (Base Level)
        - API MODE: retrieves data automatically via REST API (Advanced Level)
    When both 'query' and 'filepath' are given, API mode is used.
    Args:
        source: The data source (case-insensitive). Supported values:
            File mode: 'scopus', 'dimensions', 'pubmed'
            API mode: 'openalex', 'pubmed'
        filepath: Path to the local file (required for file mode).
        query: Search query string (required for API mode).
        max_results: Maximum number of records to retrieve (API mode only).
        run_validation: If True, runs the validator before returning the DataFrame.
    Returns:
        A fully standardized pandas DataFrame ready for Bibliometrix analysis.
    Raises:
        ValueError: If neither filepath nor query is provided, or if the source is not supported.
    Examples:
        >>> df = convert2df(source="scopus", filepath="scopus_export.csv")
        >>> df = convert2df(source="openalex", query="deep learning", max_results=50)
    """
    api_mode = query is not None
    print(f"[convert2df] Source: {source} | Mode: {'API' if api_mode else 'FILE'}")
    source_key = _canonical_source(source)

    if api_mode:
        if source_key not in API_SOURCES:
            raise ValueError(f"API not supported for source: '{source}'. Supported API sources: {list(API_SOURCES.keys())}")
        retriever = API_SOURCES[source_key]
        df = retriever(query=query, max_results=max_results)
        if source_key == "pubmed":
            mapping = PUBMED_MAPPING
        elif source_key == "openalex":
            mapping = OPENALEX_MAPPING
        else:
            mapping = {}
    elif filepath is not None:
        # Look the source up by canonical name so the registry key spelling
        # ("dimension" vs "dimensions") does not matter.
        registry_key = next(
            (name for name in FILE_SOURCES if _canonical_source(name) == source_key),
            None,
        )
        if registry_key is None:
            supported = sorted({_canonical_source(name) for name in FILE_SOURCES})
            raise ValueError(f"File mode not supported for source: '{source}'. Supported file sources: {supported}")
        df = extract_file(source_key, filepath)
        mapping = FILE_SOURCES[registry_key]
    else:
        raise ValueError("You must provide either 'filepath' (file mode) or 'query' (API mode).")

    db_name = source_key.upper()
    df = transform(df, mapping, db_name)
    if run_validation:
        validate(df)
    return df
LIST_COLUMNS = ["AU", "AF", "C1", "CR", "DE", "ID"]
INT_COLUMNS = ["TC"]
COLUMN_DEFAULT = {
    "DB": "",
    "UT": "",
    "DI": "",
    "PMID": "",
    "TI": "",
    "SO": "",
    "JI": "",
    "PY": "",
    "DT": "",
    "LA": "",
    "TC": 0,
    "AU": [],
    "AF": [],
    "C1": [],
    "RP": "",
    "CR": [],
    "DE": [],
    "ID": [],
    "AB": "",
    "VL": "",
    "IS": "",
    "BP": "",
    "EP": "",
    "SR": "",
}


def rename_columns(df: pd.DataFrame, mapping: dict) -> pd.DataFrame:
    """
    Rename raw source columns to standard WoS field tags using a mapping dictionary.
    Mapping entries whose raw column is absent are ignored. A rename is also
    skipped when its target tag is already a column, because it would create
    two columns with the same label.
    Args:
        df: The raw DataFrame from the source.
        mapping: A dictionary mapping raw column names to WoS tags.
    Returns:
        A new DataFrame with renamed columns.
    """
    taken = set(df.columns)
    renames = {}
    for raw_name, tag in mapping.items():
        if raw_name not in df.columns:
            continue
        if tag != raw_name and tag in taken:
            continue
        renames[raw_name] = tag
        taken.add(tag)
    return df.rename(columns=renames)


def _to_list(val) -> list:
    """Coerce one cell of a list column to a Python list."""
    if isinstance(val, list):
        return val
    if isinstance(val, tuple):
        return list(val)
    if val is None:
        return []
    if isinstance(val, str):
        # An empty string yields an empty list.
        return [item.strip() for item in val.split(";") if item.strip()]
    # pd.isna() returns an array for array-like input, so only test scalars.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return []
    return [str(val)]


def enforce_list_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure that all list columns contain Python lists.
    String values are split using semicolon as delimiter; nulls become
    empty lists; any other scalar becomes a one-item list of its string form.
    Args:
        df: The DataFrame after column renaming (modified in place).
    Returns:
        The DataFrame with list columns properly typed.
    """
    for col in LIST_COLUMNS:
        if col in df.columns:
            df[col] = df[col].apply(_to_list)
    return df


def enforce_int_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure that integer columns are properly cast to integers.
    Values that cannot be parsed as numbers, and nulls, become 0.
    Args:
        df: The DataFrame after column renaming (modified in place).
    Returns:
        The DataFrame with integer columns properly typed.
    """
    for col in INT_COLUMNS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0).astype(int)
    return df


def fill_missing_columns(df: pd.DataFrame, db_name: str) -> pd.DataFrame:
    """
    Add any missing mandatory columns with their default empty values.
    Also sets the DB column to identify the data source.
    Args:
        df: The DataFrame after type enforcement (modified in place).
        db_name: The name of the source database (e.g. 'SCOPUS', 'PUBMED').
    Returns:
        The DataFrame with all mandatory columns present.
    """
    for col, default in COLUMN_DEFAULT.items():
        if col in df.columns:
            continue
        if isinstance(default, list):
            # One distinct empty list per row, so rows never share a list object.
            df[col] = pd.Series([[] for _ in range(len(df))], index=df.index, dtype=object)
        else:
            df[col] = default
    df["DB"] = db_name.upper()
    return df


def fill_null_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Replace all remaining NaN and None values with appropriate defaults.
    List columns get empty lists, integer columns get 0, all others get
    empty strings.
    Args:
        df: The DataFrame before final export (modified in place).
    Returns:
        The DataFrame with no null values remaining.
    """
    for col in df.columns:
        if col in LIST_COLUMNS:
            df[col] = df[col].apply(lambda cell: cell if isinstance(cell, list) else [])
        elif col in INT_COLUMNS:
            df[col] = df[col].fillna(0)
        else:
            column = df[col]
            if not column.isna().any():
                continue
            if pd.api.types.is_numeric_dtype(column):
                # A numeric column cannot hold "", so widen it to object first.
                column = column.astype(object)
            df[col] = column.fillna("")
    return df


def calculate_sr(df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate the Short Reference (SR) field for each record.
    Format: 'FirstAuthorSurname, PublicationYear, JournalName'
    Args:
        df: The DataFrame with AU, PY, and SO columns populated (modified in place).
    Returns:
        The DataFrame with the SR column filled.
    """
    def build_sr(row):
        authors = row.get("AU", [])
        if isinstance(authors, list) and len(authors) > 0:
            first_author = str(authors[0]).split(",")[0].strip()
        else:
            first_author = ""
        year = row.get("PY", "")
        if isinstance(year, float) and year.is_integer():
            # A year read as float (e.g. 2020.0) must print as "2020".
            year = int(year)
        year = str(year).strip()
        journal = str(row.get("SO", "")).strip()
        return f"{first_author}, {year}, {journal}"

    df["SR"] = df.apply(build_sr, axis=1)
    return df


def transform(df: pd.DataFrame, mapping: dict, db_name: str) -> pd.DataFrame:
    """
    Run the full transformation pipeline on a raw DataFrame.
    Steps: rename -> enforce types -> fill missing columns -> fill nulls -> calculate SR.
    The renaming step returns a new frame, so the caller's DataFrame is left untouched.
    Args:
        df: The raw DataFrame loaded from the source file or API.
        mapping: The column mapping dictionary for this source.
        db_name: The name of the source database.
    Returns:
        A fully standardized DataFrame ready for validation and analysis.
    """
    print(f"[TRANSFORM] Starting transformation for source: {db_name}")
    df = rename_columns(df, mapping)
    print("[TRANSFORM] Columns renamed.")
    df = enforce_list_columns(df)
    print("[TRANSFORM] List columns enforced")
    df = enforce_int_columns(df)
    print("[TRANSFORM] Integer columns enforced.")
    df = fill_missing_columns(df, db_name)
    print("[TRANSFORM] Missing columns filled.")
    df = fill_null_values(df)
    print("[TRANSFORM] Null values filled.")
    df = calculate_sr(df)
    print("[TRANSFORM] SR field calculated.")
    print(f"[TRANSFORM] Done. Shape: {df.shape}.")
    return df
MANDATORY_COLUMNS = [
    "DB", "UT", "DI", "PMID", "TI", "SO", "JI", "PY", "DT", "LA", "TC", "AU",
    "AF", "C1", "RP", "CR", "DE", "ID", "AB", "VL", "IS", "BP", "EP", "SR",
]

LIST_COLUMNS = ["AU", "AF", "C1", "CR", "DE", "ID"]


def check_mandatory_columns(df: pd.DataFrame) -> list:
    """
    Check that all mandatory columns are present in the DataFrame.
    Args:
        df: The standardized DataFrame to validate.
    Returns:
        A list of missing column names. Empty list means all columns are present.
    """
    return [col for col in MANDATORY_COLUMNS if col not in df.columns]


def check_no_nulls(df: pd.DataFrame) -> dict:
    """
    Check that no NaN or None values remain in the DataFrame.
    Columns are visited by position, so a frame with duplicate column labels
    is reported on instead of raising; counts of same-named columns are summed.
    Args:
        df: The standardized DataFrame to validate.
    Returns:
        A dictionary mapping column names to the count of null values found.
        Empty dict means no nulls found.
    """
    null_counts = {}
    for position, col in enumerate(df.columns):
        count = int(df.iloc[:, position].isna().sum())
        if count > 0:
            null_counts[col] = null_counts.get(col, 0) + count
    return null_counts


def check_list_columns(df: pd.DataFrame) -> list:
    """
    Check that list columns contain actual Python lists and not strings.
    Null cells are ignored here; they are reported by check_no_nulls().
    Args:
        df: The standardized DataFrame to validate.
    Returns:
        A list of column names where the type contract is violated.
    """
    violations = []
    for col in LIST_COLUMNS:
        # Select by position: with duplicate labels df[col] would be a
        # DataFrame, and iterating that yields labels rather than cells.
        positions = [i for i, name in enumerate(df.columns) if name == col]
        for position in positions:
            cells = df.iloc[:, position].dropna()
            if any(not isinstance(cell, list) for cell in cells):
                violations.append(col)
                break
    return violations


def validate(df: pd.DataFrame) -> bool:
    """
    Run all validation checks on the standardized DataFrame.
    Prints a report of any issues found.
    Args:
        df: The standardized DataFrame to validate.
    Returns:
        True if all checks pass, False if any check fails.
    """
    print("---Running ETL Validation---")
    passed = True

    missing_cols = check_mandatory_columns(df)
    if missing_cols:
        print(f"[FAIL] Missing mandatory columns: {missing_cols}")
        passed = False
    else:
        print("[OK] All mandatory columns are present.")

    null_counts = check_no_nulls(df)
    if null_counts:
        print(f"[FAIL] Null values found: {null_counts}")
        passed = False
    else:
        print("[OK] No null values found.")

    violations = check_list_columns(df)
    if violations:
        print(f"[FAIL] List type violations in columns: {violations}")
        passed = False
    else:
        print("[OK] All list columns are correctly typed.")

    if passed:
        print("---Validation PASSED---")
    else:
        print("---Validation FAILED---")
    return passed
""" - M = df.get() + from .metatagextraction import metaTagExtraction + df = metaTagExtraction(df, "SR") + M = df.copy() db = M['DB'][0] # Ensure required fields are present diff --git a/www/services/histplot.py b/www/services/histplot.py index fb5c472f7..d46f920d9 100644 --- a/www/services/histplot.py +++ b/www/services/histplot.py @@ -31,7 +31,7 @@ def histPlot(histResults, n=20, size=5, labelsize=5, remove_isolates=True, title # Selezioniamo il valore di soglia s sorted_LCS = LCS.sort_values(ascending=False) - s = sorted_LCS.iloc[min(n, len(sorted_LCS))] + s = sorted_LCS.iloc[min(n, len(sorted_LCS)) - 1] # Troviamo gli indici (etichette) che soddisfano la condizione LCS >= s selected_columns = sorted_LCS[sorted_LCS >= s].index.tolist() diff --git a/www/services/metatagextraction.py b/www/services/metatagextraction.py index 5e1f8b9c8..94efbca82 100644 --- a/www/services/metatagextraction.py +++ b/www/services/metatagextraction.py @@ -14,7 +14,7 @@ def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False): Returns: A DataFrame with the extracted metadata tags. 
""" - M = df.get() + M = df.copy() if Field == "SR": M = SR(M) @@ -40,10 +40,8 @@ def metaTagExtraction(df, Field="AU_CO", sep=";", aff_disamb=False): ind = M["AU1_UN"].str.find("),") a = ind[ind > -1].index M.loc[a, "AU1_UN"] = M.loc[a, "AU1_UN"].str[ind[a] + 2:] - - df.set(M) - return df + return M def SR(M): diff --git a/www/services/networkplot.py b/www/services/networkplot.py index 156cfbfd0..b9d545380 100644 --- a/www/services/networkplot.py +++ b/www/services/networkplot.py @@ -1,6 +1,6 @@ from .utils import * from .cocmatrix import * - +import builtins def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", type="auto", label=True, labelsize=1, label_cex=False, label_color=False, label_n=None, halo=False, @@ -22,6 +22,7 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t # Create igraph object bsk_network = ig.Graph.Weighted_Adjacency(NetMatrix.values.tolist(), mode=ig.ADJ_UNDIRECTED, attr="weight") bsk_network.vs["name"] = NetMatrix.columns + print(f"Nodes after creation: {len(bsk_network.vs)}") # Compute node degrees deg = np.array(bsk_network.degree()) @@ -60,11 +61,13 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t bsk_network.delete_vertices(indices_to_delete) if bsk_S is not None: bsk_S.delete_vertices(indices_to_delete) + print(f"Nodes after n filter: {len(bsk_network.vs)}") # Simplify the graph if edges_min > 1: remove_multiple = False bsk_network.simplify(multiple=remove_multiple, loops=noloops) + print(f"Nodes after simplify: {len(bsk_network.vs)}") if bsk_S is not None: bsk_S.simplify(multiple=remove_multiple, loops=noloops) @@ -98,6 +101,16 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t if bsk_S is not None: isolates_to_remove = [v.index for v in bsk_S.vs if v["name"] not in bsk_network.vs["name"]] bsk_S.delete_vertices(isolates_to_remove) + print(f"Nodes after remove_isolates: {len(bsk_network.vs)}") + + if 
len(bsk_network.vs) == 0: + print("Warning: Network is empty after filtering.") + return { + "S": None, + "graph": bsk_network, + "cluster_res": pd.DataFrame(), + "cluster_obj": builtins.type('obj', (object,), {'membership': []})() + } # Apply clustering cl = clustering_network(bsk_network, cluster) @@ -180,6 +193,7 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t else: cluster_res = None + return { "S": S, "graph": bsk_network, @@ -188,6 +202,8 @@ def network_plot(NetMatrix, normalize=None, n=None, degree=None, Title="Plot", t } + + def delete_isolates(graph, mode='all'): isolates = [v.index for v in graph.vs if graph.degree(v, mode=mode) == 0] graph.delete_vertices(isolates) @@ -279,10 +295,15 @@ def switch_layout(bsk_network, type, community_repulsion): # Normalizza manualmente il layout l_coords = np.array(l.coords) - min_coords = l_coords.min(axis=0) - max_coords = l_coords.max(axis=0) - normalized_coords = (l_coords - min_coords) / (max_coords - min_coords) - l = ig.Layout(normalized_coords.tolist()) + if len(l_coords) == 0: + l = ig.Layout([[0,0]]) + else: + min_coords = l_coords.min(axis=0) + max_coords = l_coords.max(axis=0) + range_coords = max_coords - min_coords + range_coords[range_coords==0] = 1 + normalized_coords = (l_coords - min_coords) / range_coords + l = ig.Layout(normalized_coords.tolist()) return {"l": l, "bsk_network": bsk_network} diff --git a/www/services/termextraction.py b/www/services/termextraction.py index f7d9a52c1..5ccc921b9 100644 --- a/www/services/termextraction.py +++ b/www/services/termextraction.py @@ -20,7 +20,7 @@ def term_extraction(df, field="TI", ngrams=1, stemming=False, language="english" Returns: A DataFrame with the extracted terms. 
""" - M = df.get() + M = df.copy() # Load and update stopwords overall_start_time = time.time() @@ -97,7 +97,4 @@ def term_extraction(df, field="TI", ngrams=1, stemming=False, language="english" if verbose: print(terms_df.sum().sort_values(ascending=False).head(25)) - # Finalize the output - df.set(M) - - return df + return M diff --git a/www/services/thematicmap.py b/www/services/thematicmap.py index 3c313b7f6..ab33123b1 100644 --- a/www/services/thematicmap.py +++ b/www/services/thematicmap.py @@ -2,12 +2,16 @@ from .igraph2vis import * from .termextraction import * from .biblionetwork import * +from pyvis.network import Network +import tempfile +import os +from www.services.couplingmap import avoid_net_overlaps def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, size=0.5, n_labels=1, community_repulsion=0.1, repel=True, remove_terms=None, synonyms=None, cluster="walktrap", subgraphs=False): # df = metaTagExtraction(df, field=field) M = df - m = df.get() + m = df.copy() # Set ngrams based on field ngrams = int(ngrams) if field in ['TI', 'AB'] else 1 @@ -78,17 +82,21 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz }) # Filter and process cluster data + print("df_lab before groupby:", df_lab.columns.tolist()) + print("df_lab shape:", df_lab.shape) df_lab = (df_lab[df_lab['sC'] >= minfreq] - .groupby('groups') + .groupby('groups', group_keys=False) .apply(lambda x: pd.Series({ 'freq': x['sC'].sum(), 'cluster_label': x.loc[x['sC'].idxmax(), 'words'], - 'sC': list(x['sC']), # Se necessario mantenere i valori di sC - 'words': ', '.join(x['words'].astype(str)), # <-- Converte in stringa pulita - 'color': x['color'].iloc[0] # Prende il primo valore della colonna + 'sC': list(x['sC']), + 'words': ', '.join(x['words'].astype(str)), + 'color': x['color'].iloc[0], + 'groups': x.name })) - .reset_index()) - + .reset_index(drop=True)) + print("df_lab columns:", df_lab.columns.tolist()) + print("df_lab head:", 
df_lab.head()) # Explode both words and sC columns to create rows for each word and its occurrence count df_lab = df_lab.assign( words=df_lab['words'].str.split(', '), @@ -117,16 +125,10 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz df_lab['words'] = df_lab['words'].astype(str) # Perform left joins equivalent to R's left_join operations - sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']], - left_on='words1', - right_on='words', - how='left') - sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']], - left_on='words2', - right_on='words', - how='left', - suffixes=('', '2')) - + sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']].rename(columns={'words': 'words1_match', 'groups': 'groups1'}), + left_on='words1', right_on='words1_match', how='left').drop(columns=['words1_match'], errors='ignore') + sEij_df = sEij_df.merge(df_lab_top[['words', 'groups']].rename(columns={'words': 'words2_match', 'groups': 'groups2'}), + left_on='words2', right_on='words2_match', how='left').drop(columns=['words2_match'], errors='ignore') # Drop the extra 'words' columns created by the merge sEij_df = sEij_df.drop(['words', 'words_y'], axis=1, errors='ignore') @@ -135,6 +137,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz .groupby('groups') .first() .reset_index()) + print("df_lab_top columns:", df_lab_top.columns.tolist()) # Remove duplicate columns sEij_df = sEij_df.loc[:, ~sEij_df.columns.duplicated()] @@ -165,8 +168,8 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz # 3. 
Filtra correttamente i dati df = ( filtered_df - .assign(ext=lambda x: (x['groups'] != x['groups2']).astype(int)) - .groupby('groups') + .assign(ext=lambda x: (x['groups1'] != x['groups2']).astype(int)) + .groupby('groups1') .agg({ 'words1': lambda x: len(set(x)), 'eij': lambda x: sum(x * x.index), # calculate centrality as sum(eij*ext) @@ -178,14 +181,16 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz 'ext': 'CallonDensity' }) .assign( - CallonDensity=lambda x: x['CallonDensity'] / x['n'] * 100, + CallonDensity=lambda x: x['CallonDensity'] / x['n'].astype(float) * 100, RankCentrality=lambda x: x['CallonCentrality'].rank(), RankDensity=lambda x: x['CallonDensity'].rank() ) - .merge(df_lab_top, on='groups', how='left') - .rename(columns={'cluster_label': 'Cluster', 'freq': 'ClusterFrequency'}) .reset_index() + .rename(columns={'cluster_label': 'Cluster', 'freq': 'ClusterFrequency'}) + .merge(df_lab_top, left_on='groups1', right_on='groups', how='left') + .rename(columns={'cluster_label':'Cluster', 'freq':'ClusterFrequency'}) ) + print("df columns:", df.columns.tolist()) # Calculate plot parameters meandens = df['RankDensity'].mean() @@ -320,7 +325,14 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz ############################################################################################################################################## # Rename and rearrange columns in df_lab - df_lab.columns = ['Cluster', 'Cluster_Frequency', 'Cluster_Label', 'Occurrences', 'Words', 'Color'] + df_lab = df_lab.rename(columns = { + 'freq': 'Cluster_Frequency', + 'cluster_label':'Cluster_Label', + 'sC':'Occurrences', + 'words':'Words', + 'color':'Color', + 'groups':'Cluster' + }) df_lab = (df_lab .sort_values('Cluster') .dropna(subset=['Color']) @@ -396,7 +408,7 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz # Generate layout # Using default igraph layout - layout = 
Net['graph']['layout'] + layout = Net['graph'].layout_fruchterman_reingold() # Get coordinates from layout coords = np.array([[pos[0], pos[1]] for pos in layout]) @@ -560,7 +572,11 @@ def thematic_map(df, field="ID", n=250, minfreq=5, ngrams=1, stemming=False, siz def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, threshold=0.5): + print("cluster_assignment called, field:", field) + print("cluster_assignment M columns:", M.columns.tolist()) # Integrate stopwords and synonyms in M original field + from www.services.metatagextraction import metaTagExtraction + M = metaTagExtraction(M, "SR") if field in ["AB", "TI"]: field = f"{field}_TM" @@ -571,13 +587,22 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh all_terms = [] all_sr = [] + if 'SR' not in M.columns: + from www.services.metatagextraction import metaTagExtraction + M = metaTagExtraction(M, "SR") + # Iterate through each row for i, terms_list in enumerate(Fi): if isinstance(terms_list, list): - for term in terms_list: - if term: # Skip empty terms - all_terms.append(term.strip()) - all_sr.append(M['SR'].iloc[i]) + items = terms_list + elif isinstance(terms_list, str): + items = [t.strip() for t in terms_list.split(';')] + else: + items = [] + for term in items: + if term: # Skip empty terms + all_terms.append(term.strip()) + all_sr.append(M['SR'].iloc[i]) all_field = pd.DataFrame({ 'terms': all_terms, @@ -605,7 +630,7 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh # Process words dataframe words = words.assign( - p_w=1/words['Occurrences'], + p_w=1/words['Occurrences'].astype(float), p_c=words['pagerank_centrality'] ) @@ -623,13 +648,13 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh ) # Calculate probabilities - terms = (terms.groupby('SR') - .apply(lambda x: x.assign(pagerank=x['p_c'].sum())) - .reset_index(drop=True) - .groupby(['SR', 'Cluster_Label']) - .agg({'p_w': 'sum', 
'p_c': 'max'}) + print("before groupby SR Cluster_Label, terms columns:", terms.columns.tolist()) + terms['pagerank'] = terms.groupby('SR')['p_c'].transform('sum') + terms = (terms.groupby(['SR', 'Cluster_Label']) + .agg({'p_w': 'sum', 'p_c':'max'}) .reset_index() - .rename(columns={'p_c': 'pagerank'})) + .rename(columns={'p_c':'pagerank'})) + print("after groupby terms columns:", terms.columns.tolist()) terms['p'] = terms['p_w'] / terms.groupby('SR')['p_w'].transform('sum') terms = terms.dropna(subset=['Cluster_Label']).drop('p_w', axis=1) @@ -645,20 +670,26 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh terms_pagerank = (terms.merge(terms_max, on='SR') .query('Cluster_Label == Assigned_cluster')[['SR', 'pagerank']]) + print("before pivot, terms columns:", terms.columns.tolist()) + # Pivot and merge results terms = (terms.drop('pagerank', axis=1) .pivot(index='SR', columns='Cluster_Label', values='p') .reset_index() # Ensure SR is only a column .rename_axis(None, axis=1) # Remove any index name ) + print("after pivot, terms columns:", terms.columns.tolist()) # Now merge with terms_max and terms_pagerank terms = terms.merge(terms_max, on='SR').merge(terms_pagerank, on='SR') - + print("after merge terms_max, terms columns:", terms.columns.tolist()) # Process final results if 'DI' not in M.columns: M['DI'] = np.nan year = pd.Timestamp.now().year + 1 + print("terms columns before final merge:", terms.columns.tolist()) + print("SR_cited in terms:", 'SR' in terms.columns) + M = M.reset_index(drop=True) terms = (M.assign( TCpY=lambda x: x['TC']/(year-x['PY']), @@ -669,5 +700,6 @@ def cluster_assignment(M, words, field, remove_terms=None, synonyms=None, thresh .groupby('Assigned_cluster') .apply(lambda x: x.sort_values('TC', ascending=False)) .reset_index(drop=True)) + print("terms done") return terms