Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 1 addition & 14 deletions dopull/dopull.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
except ImportError:
pwd = None

VERSION = "2026.05.29"
VERSION = "2026.06.09"

SCRIPT_DIR = Path(__file__).resolve().parent
# Parent directory of where to look for files to push out.
Expand All @@ -33,11 +33,9 @@
LOGFILE = Path(os.getenv("LOGFILE", str(SCRIPT_DIR / "logs/dopull.log")))
# Lock file to prevent multiple dopulls running at the same time.
PULLRUNNING = Path(os.getenv("PULLRUNNING", str(SCRIPT_DIR / ".dopull-running")))
# Trigger directory for JSON processing on ibiblio (kept for compatibility with shell config).
IBIBLIO = "gutenberg.login.ibiblio.org"
PRIVATE = os.getenv('PRIVATE') or ''
IBIBLIO_DOPULL_DIR = os.path.join(PRIVATE, 'logs', 'dopull')
IBIBLIO_JSON_DIR = os.path.join(PRIVATE, 'logs', 'json')
# Email address to send trouble reports to.
BOSS = os.getenv("BOSS", "pterodactyl@fastmail.com")
LOGGER = logging.getLogger("dopull")
Expand Down Expand Up @@ -125,7 +123,6 @@ def main() -> int:
• For each trigger file found in "push" directory,
◦ Get owner of file (user)
◦ Trigger ebook update by copying it to the ibiblio dopull dir.
◦ If file is .json, trigger ebook indexing by copying it to the ibiblio JSON dir.
◦ Move file to DONE archive
◦ Send success/fail email to user
"""
Expand Down Expand Up @@ -179,16 +176,6 @@ def process_trigger_file(trigger_file: Path) -> str:
append_out(f"Failed to trigger ibiblio update for {filename}: {e}")
return "failure"

# Handle .json files for ebook indexing.
if trigger_file.suffix.lower() == ".json":
try:
dest = f"{IBIBLIO}:{IBIBLIO_JSON_DIR}/{filename}"
subprocess.run(["scp", str(trigger_file), dest], check=True)
append_out(f"Copied {filename} to ibiblio to trigger ebook indexing.")
except Exception as e:
append_out(f"Failed to trigger ebook indexing for {filename}: {e}")
return "failure"

# If we got to here, all is OK, move trigger file to the DONE directory,
# otherwise, it will be retried on the next run.
try:
Expand Down
60 changes: 49 additions & 11 deletions puller.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,11 +42,24 @@
# These are where .zip.trig files go on ibiblio :
DOPULL_LOG_DIR = os.path.join(PRIVATE, 'logs', 'dopull')
DOPUSH_LOG_DIR = os.path.join(PRIVATE, 'logs', 'dopush')
JSON_LOG_DIR = os.path.join(PRIVATE, 'logs', 'json')


def scan_dopull_log():
"""
Scan the dopull log directory for new files.
"""
Scan DOPULL_LOG_DIR for new files.
Note: this does 3 things:
1. For all trigger files, it pulls the latest files from the upstream repo into the FILES directory.
2. Moves .json files to JSON_LOG_DIR for database processing, and creates an empty .zip.trig trigger file in DOPUSH_LOG_DIR if one does not already exist.
3. Moves .zip.trig files to DOPUSH_LOG_DIR for database updates.
Both directories are processed by FileInfo.py. In the future, it should be updated to do the appropriate
processing for each file type, but for now this is a simple way to get the files where they need to go
without needing to change FileInfo.py.

If both .zip.trig and .json files are present for the same ebook number
(Workflow creates a .json file, then Errata Workbench creates a .zip.trig file),
that should be OK: the repo has all the changes, and we need the trigger file in any case.
The repo pull will occur twice, but the second will have no changes, and this should be too rare to worry about.
"""
for filename in sorted(os.listdir(DOPULL_LOG_DIR)):
mode = os.stat(os.path.join(DOPULL_LOG_DIR, filename))[stat.ST_MODE]
Expand All @@ -55,21 +68,46 @@ def scan_dopull_log():
continue

ebook_num = 0
m = re.match(r'^(\d+)\.zip\.trig$', filename)
m = re.match(r'^(\d+)\.(zip\.trig|json)$', filename)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Before, we didn't have to worry about duplicate .zip.trig files. Now we need to decide what to do if there are both .zip.trig and .json files for the same book number.

Can we just assume that any trig file that is not empty contains json?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment about the precedence?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Like this, in the function description?

    If both .zip.trig and .json files are present for the same ebook number
    (Workflow creates a .json file, then Errata Workbench creates a .zip.trig file),
    that should be OK: the repo has all the changes, and we need the trigger file in any case.
    The repo pull will occur twice, but the second will have no changes, and this should be too rare to worry about.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to adjust for .zip.trig processed first, and already moved to DOPUSH_LOG_DIR.

if m:
ebook_num = int(m.group(1))
ebook_num = m.group(1)
if not ebook_num.isdigit():
logging.error(f'Skipping invalid filename (non-numeric book number): {filename}')
continue
logging.info(ebook_num)
origin = f'{UPSTREAM_REPO_DIR}{ebook_num}.git/'
target_path = os.path.join(FILES, str(ebook_num))
target_path = os.path.join(FILES, ebook_num)
logging.info(f'origin: {origin}, target_path: {target_path}')

if update_folder(origin, target_path):
shutil.move(os.path.join(DOPULL_LOG_DIR, filename),
os.path.join(DOPUSH_LOG_DIR, filename))
else:
logging.error(f'failed to update {ebook_num}')

# Get the latest files from the upstream repo
if not update_folder(origin, target_path):
logging.error(f'failed to get files for {ebook_num}')
continue

# Now trigger database/catalog update
try:
if filename.endswith('.json'):
# For .json files, move them to the JSON_LOG_DIR to add to the database
shutil.move(os.path.join(DOPULL_LOG_DIR, filename),
os.path.join(JSON_LOG_DIR, filename))
logging.info(f'moved {filename} to JSON log directory for processing.')

# Create a corresponding .zip.trig trigger file
trigger_file = os.path.join(DOPUSH_LOG_DIR, ebook_num + '.zip.trig')
if not os.path.exists(trigger_file):
with open(trigger_file, 'w') as file:
pass
else:
# Move file to the DOPUSH_LOG_DIR to trigger updating
trigger_push = os.path.join(DOPUSH_LOG_DIR, filename)
if not os.path.exists(trigger_push):
shutil.move(os.path.join(DOPULL_LOG_DIR, filename), trigger_push)
except Exception as e:
logging.error(f'failed to trigger update for {ebook_num}: {e}')

return


def main():
sys.exit(scan_dopull_log())

Expand Down