Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
85 changes: 85 additions & 0 deletions ETL_Execution_Evidence.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# ETL Pipeline Execution Evidence\n",
"This notebook demonstrates the execution of the custom ETL pipeline retrieving data from the OpenAlex API, standardizing it, validating it, and preparing it for the Bibliometrix dashboard."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import pandas as pd\n",
"# Ensure local modules can be imported\n",
"sys.path.append(os.path.abspath(\".\"))\n",
"\n",
"from www.services.etl import ETLPipeline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Live Query Execution via API"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"machine learning\"\n",
"print(f\"Executing live API query to OpenAlex for: {query}...\")\n",
"df_standardized = ETLPipeline.convert2df(source_data=\"API\", source_type=\"OpenAlex\", is_api=True, query=query)\n",
"print(f\"\\nSuccessfully retrieved and standardized {len(df_standardized)} records.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Validation and Normalized Output\n",
"Displaying the first 5 normalized rows demonstrating standard Web of Science columns (e.g., UT, TI, CR, PY)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"pd.set_option(\"display.max_columns\", None)\n",
"df_standardized.head(5)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
276 changes: 162 additions & 114 deletions app.py

Large diffs are not rendered by default.

Binary file added execution_log.txt
Binary file not shown.
11 changes: 7 additions & 4 deletions functions/get_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,21 +42,24 @@ def get_data(input, database, df, reset_callback=None):
else:
# Process single file (original logic)
type = file[0]["name"]
json = biblio_json(file[0]["datapath"], source, type, author)
df.set(pd.read_json(StringIO(json)))

# Base Level: Bypass the fragile legacy 'biblio_json' parser and directly use our ETL Pipeline!
clean_df = ETLPipeline.convert2df(source_data=file[0]["datapath"], source_type=source, is_api=False, original_filename=type)
df.set(clean_df)

# Reset all analysis results when new dataset is loaded
if reset_callback:
reset_callback()

if type.endswith(".zip"):
text = ui.p(
f"{database}'s ZIP archive uploaded and extracted successfully! "
f"{database}'s ZIP archive uploaded, extracted, and Standardized successfully! "
f"Multiple files have been processed and combined. "
f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns."
)
else:
text = ui.p(
f"{database}'s file uploaded successfully! You can now proceed to analyze your data. "
f"{database}'s file uploaded and Standardized successfully! You can now proceed to analyze your data. "
f"The dataset contains {df.get().shape[0]} rows and {df.get().shape[1]} columns."
)
except Exception as e:
Expand Down
91 changes: 91 additions & 0 deletions generate_notebook.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import json

# Builds the "ETL Pipeline Execution Evidence" notebook as a plain nbformat-4
# dictionary and writes it to NOTEBOOK_PATH in the current working directory.
# The generated cells are unexecuted (no outputs, execution_count = null).

NOTEBOOK_PATH = 'ETL_Execution_Evidence.ipynb'


def _markdown_cell(source_lines):
    """Return an nbformat-4 markdown cell whose body is ``source_lines``."""
    return {
        'cell_type': 'markdown',
        'metadata': {},
        'source': list(source_lines),
    }


def _code_cell(source_lines):
    """Return an unexecuted nbformat-4 code cell whose body is ``source_lines``."""
    return {
        'cell_type': 'code',
        'execution_count': None,
        'metadata': {},
        'outputs': [],
        'source': list(source_lines),
    }


# Every source line except the last one of a cell carries its own trailing '\n'.
notebook = {
    'cells': [
        _markdown_cell([
            '# ETL Pipeline Execution Evidence\n',
            'This notebook demonstrates the execution of the custom ETL pipeline retrieving data from the OpenAlex API, standardizing it, validating it, and preparing it for the Bibliometrix dashboard.',
        ]),
        _code_cell([
            'import sys\n',
            'import os\n',
            'import pandas as pd\n',
            '# Ensure local modules can be imported\n',
            'sys.path.append(os.path.abspath("."))\n',
            '\n',
            'from www.services.etl import ETLPipeline',
        ]),
        _markdown_cell([
            '## Live Query Execution via API',
        ]),
        _code_cell([
            'query = "machine learning"\n',
            'print(f"Executing live API query to OpenAlex for: {query}...")\n',
            'df_standardized = ETLPipeline.convert2df(source_data="API", source_type="OpenAlex", is_api=True, query=query)\n',
            # The doubled backslash keeps a literal "\n" escape inside the generated cell.
            'print(f"\\nSuccessfully retrieved and standardized {len(df_standardized)} records.")',
        ]),
        _markdown_cell([
            '## Validation and Normalized Output\n',
            'Displaying the first 5 normalized rows demonstrating standard Web of Science columns (e.g., UT, TI, CR, PY).',
        ]),
        _code_cell([
            'pd.set_option("display.max_columns", None)\n',
            'df_standardized.head(5)',
        ]),
    ],
    'metadata': {
        'kernelspec': {
            'display_name': 'Python 3',
            'language': 'python',
            'name': 'python3',
        },
        'language_info': {
            'codemirror_mode': {
                'name': 'ipython',
                'version': 3,
            },
            'file_extension': '.py',
            'mimetype': 'text/x-python',
            'name': 'python',
            'nbconvert_exporter': 'python',
            'pygments_lexer': 'ipython3',
            'version': '3.12.0',
        },
    },
    'nbformat': 4,
    'nbformat_minor': 4,
}

# Notebook files are UTF-8 JSON; state the encoding explicitly instead of
# relying on the platform default.
with open(NOTEBOOK_PATH, 'w', encoding='utf-8') as f:
    json.dump(notebook, f, indent=1)
print('Notebook created successfully.')
5 changes: 5 additions & 0 deletions run.bat
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
@echo off
rem Launches the Biblioshiny dashboard with Shiny for Python and opens it in
rem the default web browser.

rem app.py is referenced by a relative path, so switch to the folder that
rem contains this script first; otherwise launching from a shortcut or from
rem another directory cannot find it. pushd/popd restores the caller's directory.
pushd "%~dp0"

echo Starting Biblioshiny Dashboard...
echo Your web browser will open automatically.
python -m shiny run --launch-browser app.py

rem Keep the console window open so any error output stays readable.
pause
popd
Loading