release(v1.7.15-alpha): bulletproof downloads — resume, retry, real progress

download_update
  Each component download is now resumable via HTTP Range requests
  (Range: bytes=N-) and retried up to 6 times with exponential
  backoff (5/15/30/60/120/180s). On a dropped connection the next
  attempt picks up at the last written byte offset instead of
  restarting at zero. Streams via reqwest::Response::chunk() to the
  staging file so a 160 MB frontend tarball doesn't sit in RAM. SHA
  is verified over the complete file at the end of each component;
  mismatch nukes the staged file and restarts from scratch.

Real download progress counters
  New AtomicU64 globals DOWNLOAD_BYTES/DOWNLOAD_TOTAL are updated
  from the chunk loop. update.status exposes them as
  download_progress.{bytes_downloaded, total_bytes, active}. The
  SystemUpdate.vue progress bar now polls update.status every
  second instead of incrementing a fake random counter — and
  crucially, if the user navigates away and back, the component
  picks up the in-progress download from the backend atomics
  immediately.

Update-check retries
  handle_update_check now retries the manifest fetch up to 3 times
  with a 5s gap if the first try hits a transport error, so a
  momentary gitea hiccup doesn't make a node report "up to date"
  when there actually is a new release. Tight 10s connect timeout
  per attempt keeps the total bounded.

Artefacts:
  archipelago                                      1070c87f…c081c162b  40584792
  archipelago-frontend-1.7.15-alpha.tar.gz         8e630eba…63fd43f   162078068

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian
2026-04-20 17:17:58 -04:00
parent be8e5ee46b
commit 749234b8b0
5 changed files with 295 additions and 76 deletions

2
core/Cargo.lock generated
View File

@@ -80,7 +80,7 @@ checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61"
[[package]]
name = "archipelago"
version = "1.7.14-alpha"
version = "1.7.15-alpha"
dependencies = [
"anyhow",
"archipelago-container",

View File

@@ -1,6 +1,6 @@
[package]
name = "archipelago"
version = "1.7.14-alpha"
version = "1.7.15-alpha"
edition = "2021"
description = "Archipelago Bitcoin Node OS - Native backend"
authors = ["Archipelago Team"]

View File

@@ -157,6 +157,16 @@ impl RpcHandler {
/// Get update status without checking remote.
pub(super) async fn handle_update_status(&self) -> Result<serde_json::Value> {
let state = update::get_status(&self.config.data_dir).await?;
// Expose live download progress so the UI can resume the
// progress bar after navigation instead of showing the fake
// creep again. An RPC poll every ~1s during download drives a
// real progress indicator that survives route changes.
let downloaded = update::DOWNLOAD_BYTES
.load(std::sync::atomic::Ordering::Relaxed);
let total = update::DOWNLOAD_TOTAL
.load(std::sync::atomic::Ordering::Relaxed);
let active = total > 0 && downloaded < total;
let completed = total > 0 && downloaded >= total;
Ok(serde_json::json!({
"current_version": state.current_version,
@@ -164,6 +174,13 @@ impl RpcHandler {
"update_available": state.available_update.is_some(),
"update_in_progress": state.update_in_progress,
"rollback_available": state.rollback_available,
"download_progress": if active || completed {
Some(serde_json::json!({
"bytes_downloaded": downloaded,
"total_bytes": total,
"active": active,
}))
} else { None },
}))
}

View File

@@ -4,9 +4,17 @@ use anyhow::{Context, Result};
use chrono::Timelike;
use serde::{Deserialize, Serialize};
use std::path::Path;
use std::sync::atomic::{AtomicU64, Ordering};
use tokio::fs;
use tracing::{debug, info};
/// Live download progress counters. Updated by download_component_resumable
/// as bytes arrive and read by the update.status RPC so the UI can show
/// a real progress bar instead of a fake creep. Global because the
/// download runs in one place at a time; no need for per-handler state.
pub static DOWNLOAD_BYTES: AtomicU64 = AtomicU64::new(0);
pub static DOWNLOAD_TOTAL: AtomicU64 = AtomicU64::new(0);
const DEFAULT_UPDATE_MANIFEST_URL: &str =
"https://git.tx1138.com/lfg2025/archy/raw/branch/main/releases/manifest.json";
const UPDATE_STATE_FILE: &str = "update_state.json";
@@ -111,36 +119,57 @@ pub async fn check_for_updates(data_dir: &Path) -> Result<UpdateState> {
let mut state = load_state(data_dir).await?;
info!("Checking for updates...");
// 45s total budget, and we retry up to 3 times so a momentary
// gitea hiccup doesn't make the node report "up to date" when an
// update actually exists. Short per-attempt timeout keeps the RPC
// responsive in the common case.
let client = reqwest::Client::builder()
.timeout(std::time::Duration::from_secs(15))
.connect_timeout(std::time::Duration::from_secs(10))
.build()
.context("Failed to create HTTP client")?;
let manifest_url = update_manifest_url();
match client.get(&manifest_url).send().await {
Ok(resp) if resp.status().is_success() => {
let manifest: UpdateManifest = resp
.json()
.await
.context("Failed to parse update manifest")?;
if manifest.version != state.current_version {
info!(
current = %state.current_version,
available = %manifest.version,
"Update available"
);
state.available_update = Some(manifest);
} else {
debug!("Already on latest version: {}", state.current_version);
state.available_update = None;
let mut last_err: Option<String> = None;
let mut handled = false;
for attempt in 1..=3u8 {
if attempt > 1 {
tokio::time::sleep(std::time::Duration::from_secs(5)).await;
}
match client.get(&manifest_url).send().await {
Ok(resp) if resp.status().is_success() => {
match resp.json::<UpdateManifest>().await {
Ok(manifest) => {
if manifest.version != state.current_version {
info!(
current = %state.current_version,
available = %manifest.version,
"Update available"
);
state.available_update = Some(manifest);
} else {
debug!("Already on latest version: {}", state.current_version);
state.available_update = None;
}
handled = true;
break;
}
Err(e) => {
last_err = Some(format!("parse: {}", e));
}
}
}
Ok(resp) => {
last_err = Some(format!("HTTP {}", resp.status()));
}
Err(e) => {
last_err = Some(e.to_string());
}
}
Ok(resp) => {
debug!("Update check returned status: {}", resp.status());
}
Err(e) => {
debug!("Update check failed (offline?): {}", e);
}
if !handled {
if let Some(e) = last_err {
debug!("Update check failed after retries: {}", e);
}
}
@@ -163,6 +192,14 @@ pub async fn dismiss_update(data_dir: &Path) -> Result<()> {
/// Download update components to a staging directory.
/// Verifies SHA256 hash for each component.
///
/// Robustness: each component download is **resumable** via HTTP Range
/// requests and retried up to 6 times with exponential backoff. When
/// gitea drops the connection mid-stream (happens regularly at slow
/// raw-file throughput), the next attempt picks up where the previous
/// one left off instead of restarting from byte zero. SHA256 is
/// verified over the complete file at the end of each component, so a
/// partially-corrupt resume still fails cleanly.
pub async fn download_update(data_dir: &Path) -> Result<DownloadProgress> {
let state = load_state(data_dir).await?;
let manifest = state
@@ -176,11 +213,8 @@ pub async fn download_update(data_dir: &Path) -> Result<DownloadProgress> {
.context("Failed to create staging dir")?;
let client = reqwest::Client::builder()
// 1h per component — the bundled frontend+aiui tarball sits at
// ~160 MB and git.tx1138.com raw serves at ~70 KB/s which puts
// the worst case above the old 30 min cap. A larger timeout
// with a tight connect_timeout keeps hung connections from
// swallowing the whole budget.
// Per-request budget; each attempt gets the full hour. A retry
// restarts the budget cleanly.
.timeout(std::time::Duration::from_secs(3600))
.connect_timeout(std::time::Duration::from_secs(30))
.build()
@@ -189,49 +223,20 @@ pub async fn download_update(data_dir: &Path) -> Result<DownloadProgress> {
let mut downloaded = 0u64;
let total_bytes: u64 = manifest.components.iter().map(|c| c.size_bytes).sum();
// Seed the live counters so polls during the handshake show the
// right denominator immediately instead of 0/0 → NaN%.
DOWNLOAD_TOTAL.store(total_bytes, Ordering::Relaxed);
DOWNLOAD_BYTES.store(0, Ordering::Relaxed);
for component in &manifest.components {
info!(name = %component.name, url = %component.download_url, "Downloading component");
let resp = client
.get(&component.download_url)
.send()
.await
.with_context(|| format!("Failed to download {}", component.name))?;
if !resp.status().is_success() {
anyhow::bail!(
"Download failed for {}: HTTP {}",
component.name,
resp.status()
);
}
let bytes = resp
.bytes()
.await
.with_context(|| format!("Failed to read {}", component.name))?;
// Verify SHA256
use sha2::{Digest, Sha256};
let hash = hex::encode(Sha256::digest(&bytes));
if hash != component.sha256 {
anyhow::bail!(
"SHA256 mismatch for {}: expected {}, got {}",
component.name,
component.sha256,
hash
);
}
let dest = staging_dir.join(&component.name);
fs::write(&dest, &bytes)
.await
.with_context(|| format!("Failed to write {}", component.name))?;
download_component_resumable(&client, component, &dest, downloaded).await?;
downloaded += component.size_bytes;
DOWNLOAD_BYTES.store(downloaded, Ordering::Relaxed);
info!(
name = %component.name,
bytes = bytes.len(),
bytes = component.size_bytes,
"Component downloaded and verified"
);
}
@@ -249,6 +254,166 @@ pub async fn download_update(data_dir: &Path) -> Result<DownloadProgress> {
})
}
/// Download a single component to `dest`, resuming from the end of
/// any existing partial file via a Range request. Retries up to 6
/// times with exponential backoff (5s, 15s, 30s, 60s, 120s, 180s).
/// Verifies the SHA256 over the full file at the end.
async fn download_component_resumable(
client: &reqwest::Client,
component: &ComponentUpdate,
dest: &Path,
prior_total: u64,
) -> Result<()> {
use sha2::{Digest, Sha256};
use tokio::io::AsyncWriteExt;
const MAX_ATTEMPTS: u32 = 6;
const BACKOFFS: [u64; 5] = [5, 15, 30, 60, 120];
let mut last_err: Option<anyhow::Error> = None;
for attempt in 1..=MAX_ATTEMPTS {
let existing_len = match tokio::fs::metadata(dest).await {
Ok(m) => m.len(),
Err(_) => 0,
};
if existing_len >= component.size_bytes {
// File is already complete — break out and go verify.
break;
}
if attempt > 1 {
let delay = BACKOFFS[(attempt as usize - 2).min(BACKOFFS.len() - 1)];
tracing::warn!(
name = %component.name,
attempt,
resume_at = existing_len,
"Retrying download in {}s (previous error: {})",
delay,
last_err.as_ref().map(|e| e.to_string()).unwrap_or_default()
);
tokio::time::sleep(std::time::Duration::from_secs(delay)).await;
}
let mut req = client.get(&component.download_url);
if existing_len > 0 {
req = req.header("Range", format!("bytes={}-", existing_len));
}
let resp = match req.send().await {
Ok(r) => r,
Err(e) => {
last_err = Some(anyhow::anyhow!(e));
continue;
}
};
let status = resp.status();
// 200 OK on a fresh start, 206 Partial Content on a resume
// that the server honoured. Anything else is a problem.
let is_resume = existing_len > 0 && status == reqwest::StatusCode::PARTIAL_CONTENT;
let is_fresh = existing_len == 0 && status.is_success();
let server_ignored_range = existing_len > 0 && status == reqwest::StatusCode::OK;
if !is_resume && !is_fresh && !server_ignored_range {
last_err = Some(anyhow::anyhow!(
"HTTP {} for {} (resume offset {})",
status,
component.name,
existing_len
));
continue;
}
// If the server ignored Range (returned 200 with the full
// body), wipe the partial file and start over.
let mut file = if server_ignored_range {
let _ = tokio::fs::remove_file(dest).await;
tokio::fs::OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(dest)
.await
.context("open staging file")?
} else if is_resume {
tokio::fs::OpenOptions::new()
.append(true)
.open(dest)
.await
.context("open staging file for append")?
} else {
tokio::fs::OpenOptions::new()
.create(true)
.write(true)
.truncate(true)
.open(dest)
.await
.context("open staging file")?
};
let mut resp = resp;
let mut stream_err = false;
let mut on_disk = existing_len;
loop {
match resp.chunk().await {
Ok(Some(bytes)) => {
if let Err(e) = file.write_all(&bytes).await {
last_err = Some(anyhow::anyhow!(e).context("writing chunk"));
stream_err = true;
break;
}
on_disk += bytes.len() as u64;
DOWNLOAD_BYTES.store(
prior_total + on_disk.min(component.size_bytes),
Ordering::Relaxed,
);
}
Ok(None) => break, // stream ended cleanly
Err(e) => {
last_err = Some(anyhow::anyhow!(e).context("reading chunk"));
stream_err = true;
break;
}
}
}
let _ = file.flush().await;
let _ = file.sync_all().await;
drop(file);
if stream_err {
continue;
}
// Stream ended cleanly. If we've got the expected size, verify
// the SHA and succeed. Otherwise loop to resume from the new
// offset on the next attempt.
let final_len = tokio::fs::metadata(dest)
.await
.map(|m| m.len())
.unwrap_or(0);
if final_len < component.size_bytes {
last_err = Some(anyhow::anyhow!(
"download truncated: got {} of {} bytes",
final_len,
component.size_bytes
));
continue;
}
// Full file — verify hash.
let bytes = tokio::fs::read(dest)
.await
.context("read staging file for hash check")?;
let hash = hex::encode(Sha256::digest(&bytes));
if hash == component.sha256 {
return Ok(());
}
// SHA mismatch — the file on disk is garbage. Nuke it and
// start over from scratch on the next attempt.
let _ = tokio::fs::remove_file(dest).await;
last_err = Some(anyhow::anyhow!(
"SHA256 mismatch for {}: expected {}, got {}",
component.name,
component.sha256,
hash
));
}
Err(last_err.unwrap_or_else(|| anyhow::anyhow!("download failed without a captured error")))
}
/// Run a command as root, but *outside* the archipelago service's
/// restricted mount namespace.
///

View File

@@ -324,6 +324,29 @@ const statusMessage = ref('')
const statusIsError = ref(false)
const downloadPercent = ref(0)
const downloadPercentFormatted = computed(() => downloadPercent.value.toFixed(2))
// Poll the backend for the real bytes_downloaded / total_bytes so the
// progress bar tracks actual download state (and survives route
// changes). Returns true if a download is currently in progress.
async function pollDownloadProgress(): Promise<boolean> {
try {
const res = await rpcClient.call<{
download_progress?: {
bytes_downloaded: number
total_bytes: number
active: boolean
} | null
}>({ method: 'update.status' })
const p = res.download_progress
if (p && p.total_bytes > 0) {
downloadPercent.value = Math.min(100, (p.bytes_downloaded / p.total_bytes) * 100)
return p.active
}
return false
} catch {
return false
}
}
// Shown next to the progress bar when the fake increment has maxed out
// at 95% but the real RPC hasn't returned yet — lets the user know the
// UI hasn't frozen while SHA verification and disk writes finish.
@@ -486,14 +509,12 @@ async function downloadUpdate() {
downloadPercent.value = 0
statusMessage.value = ''
// Simulate incremental progress while waiting for the RPC. Capped at
// 95% so the bar never shows >100% before the real completion jumps it
// to 100 — previously the random increment could overshoot.
const progressInterval = setInterval(() => {
if (downloadPercent.value < 95) {
downloadPercent.value = Math.min(95, downloadPercent.value + Math.random() * 3)
}
}, 500)
// Poll the backend's real byte counter every second instead of
// faking progress. The backend exposes bytes_downloaded/total_bytes
// via update.status, updated per chunk. This also means the bar
// resumes correctly after navigating away and back — no more
// "95% for some time" mystery.
const progressInterval = setInterval(() => { void pollDownloadProgress() }, 1000)
try {
const res = await rpcClient.call<{
@@ -616,8 +637,24 @@ async function setSchedule(value: ScheduleValue) {
}
}
onMounted(() => {
Promise.all([loadStatus(), loadSchedule(), checkForUpdates()])
onMounted(async () => {
await Promise.all([loadStatus(), loadSchedule(), checkForUpdates()])
// If a download was already running when the user navigated here
// (or refreshed), pick up the progress bar where it is and keep
// polling until the backend reports done. No RPC call to start the
// download — the backend's already running it.
const active = await pollDownloadProgress()
if (active) {
downloading.value = true
const resumeInterval = setInterval(async () => {
const stillActive = await pollDownloadProgress()
if (!stillActive) {
clearInterval(resumeInterval)
downloading.value = false
downloaded.value = true
}
}, 1000)
}
})
</script>