Skip to content

Commit 739d09d

Browse files
committed
Greatly simplified the copy to the staging directory during upload.
For symlinks, we just reproduce them exactly, regardless of whether they are local or not. If they're non-local links that end up being dangling after the copy... well too bad, the user shouldn't have created them. For copies, we don't bother creating hardlinks. This is safer as it avoids propagation of hardlinks into the registry, whereby users could then inadvertently modify our "immutable" files. Performance should not be majorly impacted as we were copying across partitions anyway. To offset any perf hit, the copy is now parallelized with parLapply. We also check that the copied files have the correct MD5 checksums and sizes, just to protect against filesystem corruption.
1 parent b3e901b commit 739d09d

File tree

5 files changed

+68
-47
lines changed

5 files changed

+68
-47
lines changed

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Imports:
1212
methods,
1313
tools,
1414
jsonlite,
15+
digest,
1516
httr2
1617
Suggests:
1718
parallel,

NAMESPACE

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ export(validateVersion)
3333
export(versionPath)
3434
import(httr2)
3535
import(methods)
36+
importFrom(digest,digest)
3637
importFrom(jsonlite,fromJSON)
3738
importFrom(jsonlite,toJSON)
3839
importFrom(utils,URLencode)
3940
importFrom(utils,download.file)
40-
importFrom(utils,head)
4141
importFrom(utils,tail)

R/uploadDirectory.R

Lines changed: 34 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,15 @@
1717
#' @param spoof String containing the name of a user on whose behalf this request is being made.
1818
#' This should only be used if the Gobbler service allows spoofing by the current user.
1919
#' If \code{NULL}, no spoofing is performed.
20+
#' @param concurrent Integer specifying the number of concurrent downloads.
21+
#' Only used if files need to be copied from \code{directory} to \code{staging}.
2022
#' @inheritParams createProject
2123
#'
24+
#' @details
25+
#' If \code{directory} is not inside \code{staging}, a new staging subdirectory is allocated by \code{\link{allocateUploadDirectory}}.
26+
#' The contents of \code{directory} are then copied to the new subdirectory, preserving all symbolic links, dotfiles and empty directories.
27+
#' If \code{consume=NULL}, it is set to \code{TRUE} as the copied contents will no longer be used.
28+
#'
2229
#' @return On success, \code{NULL} is invisibly returned.
2330
#'
2431
#' @author Aaron Lun
@@ -47,7 +54,7 @@
4754
#' \code{\link{fetchManifest}}, to obtain the manifest of the versioned asset's contents.
4855
#'
4956
#' @export
50-
uploadDirectory <- function(project, asset, version, directory, staging, url, probation=FALSE, consume=NULL, ignore..=TRUE, spoof=NULL) {
57+
uploadDirectory <- function(project, asset, version, directory, staging, url, probation=FALSE, consume=NULL, ignore..=TRUE, spoof=NULL, concurrent=1L) {
5158
# Normalizing them so that they're comparable, in order to figure out whether 'directory' lies inside 'staging'.
5259
directory <- normalizePath(directory)
5360
staging <- normalizePath(staging)
@@ -65,6 +72,7 @@ uploadDirectory <- function(project, asset, version, directory, staging, url, pr
6572
if (!in.staging) {
6673
new.dir <- allocateUploadDirectory(staging)
6774
on.exit(unlink(new.dir, recursive=TRUE), add=TRUE, after=FALSE) # cleaning up after the request is done.
75+
to.copy <- character(0)
6876

6977
for (p in list.files(directory, recursive=TRUE, include.dirs=TRUE, all.files=TRUE)) {
7078
src <- file.path(directory, p)
@@ -76,19 +84,22 @@ uploadDirectory <- function(project, asset, version, directory, staging, url, pr
7684

7785
src.link <- Sys.readlink(src)
7886
if (src.link == "") {
79-
.link_or_copy(src, dest, p)
80-
81-
} else if (.is_absolute_or_local_link(src.link, p)) {
87+
to.copy <- c(to.copy, list(c(src, dest)))
88+
} else {
8289
if (!file.symlink(src.link, dest)) {
83-
stop("failed to create a symlink for '", p, "' in the staging directory")
90+
stop("failed to create a symlink '", dest, "' to '", src.link, "'")
8491
}
85-
86-
} else {
87-
full.src <- normalizePath(file.path(dirname(src), src.link))
88-
.link_or_copy(full.src, dest, p)
8992
}
9093
}
9194

95+
if (concurrent == 1L) {
96+
lapply(to.copy, copy_file)
97+
} else {
98+
cl <- parallel::makeCluster(concurrent)
99+
on.exit(parallel::stopCluster(cl), add=TRUE, after=FALSE)
100+
parallel::parLapply(cl, to.copy, copy_file)
101+
}
102+
92103
directory <- new.dir
93104
}
94105

@@ -112,37 +123,23 @@ uploadDirectory <- function(project, asset, version, directory, staging, url, pr
112123
invisible(NULL)
113124
}
114125

115-
#' @importFrom utils head
116-
.is_absolute_or_local_link <- function(target, link.path) {
117-
# Assuming Unix-style file paths, who uses a Windows HPC anyway?
118-
if (startsWith(target, "/")) {
119-
return(TRUE)
126+
#' @importFrom digest digest
127+
copy_file <- function(job) {
128+
src <- job[[1]]
129+
dest <- job[[2]]
130+
if (!file.copy(src, dest)) {
131+
stop("failed to copy '", src, "' to '", dest, "'")
120132
}
121133

122-
# Both 'target' and 'link.path' should be relative at this point, so the
123-
# idea is to check whether 'file.path(dirname(link.path), target)' is still
124-
# a child of 'dirname(link.path)'.
125-
pre.length <- length(strsplit(link.path, "/")[[1]]) - 1L
126-
post.fragments <- head(strsplit(target, "/")[[1]], -1L)
127-
128-
for (x in post.fragments) {
129-
if (x == ".") {
130-
next
131-
} else if (x == "..") {
132-
pre.length <- pre.length - 1L
133-
if (pre.length < 0L) {
134-
return(FALSE)
135-
}
136-
} else {
137-
pre.length <- pre.length + 1L
138-
}
134+
old.size <- file.info(src)$size
135+
new.size <- file.info(dest)$size
136+
if (old.size != new.size) {
137+
stop("mismatching sizes for '", src, "' and '", dest, "'")
139138
}
140139

141-
TRUE
142-
}
143-
144-
.link_or_copy <- function(src, dest, p) {
145-
if (!suppressWarnings(file.link(src, dest)) && !file.copy(src, dest)) {
146-
stop("failed to link or copy '", p, "' to the staging directory")
140+
old.md5 <- digest(file=src)
141+
new.md5 <- digest(file=dest)
142+
if (old.md5 != new.md5) {
143+
stop("mismatching MD5 checksums for '", src, "' and '", dest, "'")
147144
}
148145
}

man/uploadDirectory.Rd

Lines changed: 15 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

tests/testthat/test-upload.R

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,21 @@ test_that("upload works as expected for regular files", {
4545
expect_true(all(vapply(man, function(x) !is.null(x$link), TRUE)))
4646
})
4747

48+
test_that("upload works as expected with parallel copying", {
49+
uploadDirectory(
50+
project="test-upload",
51+
asset="penelope",
52+
version="1",
53+
directory=tmp,
54+
staging=info$staging,
55+
url=info$url,
56+
concurrent=2
57+
)
58+
59+
man <- fetchManifest("test-upload", "penelope", "1", registry=info$registry)
60+
expect_identical(sort(names(man)), c("blah.txt", "foo/bar.txt"))
61+
})
62+
4863
test_that("upload works as expected with empty directories", {
4964
tmp <- tempfile()
5065
dir.create(tmp)
@@ -91,14 +106,10 @@ test_that("upload works as expected for relative links", {
91106
dest <- tempfile()
92107
dir.create(dest)
93108
write(file=file.path(dest, "blah.txt"), letters)
94-
file.symlink("blah.txt", file.path(dest, "whee.txt")) # relative links within the directory are preserved.
109+
file.symlink("blah.txt", file.path(dest, "whee.txt"))
95110
dir.create(file.path(dest, "foo"))
96111
file.symlink("../whee.txt", file.path(dest, "foo/bar.txt"))
97112

98-
outside <- tempfile(tmpdir=dirname(dest))
99-
write(file=outside, "FOOBLEWOOBLE")
100-
file.symlink(file.path("../../", basename(outside)), file.path(dest, "foo/outer.txt")) # relative links outside the directory are lost.
101-
102113
uploadDirectory(
103114
project="test-more-upload",
104115
asset="nicole",
@@ -109,12 +120,10 @@ test_that("upload works as expected for relative links", {
109120
)
110121

111122
man <- fetchManifest("test-more-upload", "nicole", "1", registry=info$registry)
112-
expect_identical(sort(names(man)), c("blah.txt", "foo/bar.txt", "foo/outer.txt", "whee.txt"))
123+
expect_identical(sort(names(man)), c("blah.txt", "foo/bar.txt", "whee.txt"))
113124
expect_false(is.null(man[["whee.txt"]]$link))
114-
expect_null(man[["foo/outer.txt"]]$link)
115125
expect_false(is.null(man[["foo/bar.txt"]]$link))
116126
expect_null(man[["blah.txt"]]$link)
117-
expect_identical(13L, man[["foo/outer.txt"]]$size)
118127
expect_identical(man[["whee.txt"]]$size, man[["foo/bar.txt"]]$size)
119128
expect_identical(man[["whee.txt"]]$size, man[["blah.txt"]]$size)
120129
})

0 commit comments

Comments
 (0)