stuebinm
a00e28d85a
Pluto [1] is one of these interactive notebook thingies that have become so unreasonably popular with people doing machine learning or data analysis, but – somewhat surprisingly – it's actually not shit (e.g. no global mutable state in the notebook, no weird unreadable file format that doesn't play well with version control, etc.). In particular, it can be used collaboratively (while it doesn't do real-time collaborative editing like a pad, it /does/ push out global updates each time someone executes a cell, so it's reasonably close), and I think it may be useful to have for julia-hacking sessions. It may also be useful for people running low-end laptops, since code is executed on the host — and I guess hainich has enough unused resources lying around that we can spare a few. After deploying this, the notebook server should be reachable via: ssh hainich -L 9999:localhost:9999 and then visiting http://localhost:9999 Caveats: by design, pluto allows a user to execute arbitrary code on the host. That is its main function, and not something we can prevent. I've tried to mitigate this as far as possible by: - only allowing access via ssh port forwarding. In theory pluto does have basic access control, but that works via a secret link that it'll spit to stdout on startup (i.e. the journal), which cannot be set in advance, nor regenerated without restarting the entire process. Unfortunately, this means we won't be able to use it at e.g. conference sessions with people who don't have access to our infra - running it in a nixos-container as its own user, so it should never get any kind of access to the "main" directory tree apart from a single directory that we can keep notebooks in (which is currently a bind mount set to /data/pluto) - limiting memory and cpu for that container via systemd (less out of worry for exploits, and more so that a few accidental while-true loops will never consume enough cpu time to noticeably slow down anything else).
The current limits for both are chosen relatively low; we'll have to see if they become too limiting should anyone run an actual weather model on this. Things we could also do: - currently, the container does not have its own network (mostly since that would make it slightly less convenient to use with port forwarding); in theory, pluto should even be able to run entirely without internet access of its own, but I'm not sure if this would break things like loading images / raw data into a notebook - make the container ephemeral, and only keep the directory containing the notebooks. I haven't done this since it would require recompilation of pluto each time the container is wiped, which makes for a potentially inconvenient startup time (though still < 3-5 mins) Questions: - have I missed anything important that should definitely be also sandboxed / limited in some way? - in general, are we comfortable running something like this? - would we (in principle) be comfortable opening this up to other people for congress sessions (assuming we figure out a reasonable access control)? Notes to deployer: - while I have not tested this on hainich, it works on my own server - you will probably have to create the /data/pluto directory for the bind mount, and make it world-writable (or chown it to the pluto user inside the container) [1] https://github.com/fonsp/Pluto.jl/
459 lines
12 KiB
Bash
Executable file
459 lines
12 KiB
Bash
Executable file
#! /usr/bin/env bash

set -e -o pipefail

# Option state; filled in by the command-line parser further down.
url=
rev=
expHash=
fetchSubmodules=
builder=

# Defaults that may be supplied through the environment.
hashType=$NIX_HASH_ALGO
branchName=$NIX_PREFETCH_GIT_BRANCH_NAME

# ENV params
out=${out:-}
http_proxy=${http_proxy:-}

# populated by clone_user_rev()
fullRev=
humanReadableRev=
commitDate=
commitDateStrict8601=

# Any non-empty NIX_PREFETCH_GIT_DEEP_CLONE turns deep cloning on; the flag
# is normalised to either "true" or the empty string.
case "$NIX_PREFETCH_GIT_DEEP_CLONE" in
    "") deepClone= ;;
    *)  deepClone=true ;;
esac

# .git directories are kept only when NIX_PREFETCH_GIT_LEAVE_DOT_GIT is
# exactly "1"; every other value (including unset) disables it.
case "$NIX_PREFETCH_GIT_LEAVE_DOT_GIT" in
    1) leaveDotGit=true ;;
    *) leaveDotGit= ;;
esac
|
|
|
|
# Print the command-line synopsis to stderr and terminate the script.
# Outputs: help text on stderr, nothing on stdout.
# Exits:   always with status 1 (also when reached through -h/--help).
usage(){
    # NOTE: --hash feeds the hash *algorithm* (it is parsed into hashType);
    # the expected hash itself is only accepted as the third positional
    # argument.
    cat >&2 <<'EOF'
syntax: nix-prefetch-git [options] [URL [REVISION [EXPECTED-HASH]]]

Options:
      --out path      Path where the output would be stored.
      --url url       Any url understood by 'git clone'.
      --rev ref       Any sha1 or references (such as refs/heads/master)
      --hash h        Hash algorithm to use (such as sha256).
      --branch-name   Branch name to check out into
      --deepClone     Clone the entire repository.
      --no-deepClone  Make a shallow clone of just the required ref.
      --leave-dotGit  Keep the .git directories.
      --fetch-submodules Fetch submodules.
      --builder       Clone as fetchgit does, but url, rev, and out option are mandatory.
      --quiet         Only print the final json summary.

EOF
    exit 1
}
|
|
|
|
# git is chatty on stdout for some subcommands; stdout is reserved for the
# JSON summary, so every git invocation made through this wrapper has its
# standard output sent to stderr instead.
# Arguments: passed through to git unchanged.
# Returns:   git's exit status.
clean_git(){
    { git "$@"; } 1>&2
}
|
|
|
|
# Parse the command line into the global option variables (url, rev, expHash,
# out, hashType, branchName, deepClone, leaveDotGit, fetchSubmodules, builder,
# QUIET).
#
# Value-taking options are handled in two steps: seeing the option stores a
# "set_<variable>" marker, and the following word is then assigned to that
# variable.  Up to three bare words are accepted as URL, REVISION and
# EXPECTED-HASH, in that order.
#
# Arguments: the script's command-line words.
# Exits:     1 on more than three positional arguments; via usage on -h/--help.
parse_args(){
    local arg var
    local argi=0
    local argfun=""

    for arg; do
        if [[ -n "$argfun" ]]; then
            # Assign the option's value verbatim.  printf -v never re-parses
            # the value, so spaces and shell metacharacters are stored as-is
            # instead of being word-split or executed.
            case $argfun in
                set_*)
                    var=${argfun#set_}
                    printf -v "$var" '%s' "$arg"
                    ;;
            esac
            argfun=""
            continue
        fi

        case $arg in
            --out) argfun=set_out;;
            --url) argfun=set_url;;
            --rev) argfun=set_rev;;
            --hash) argfun=set_hashType;;
            --branch-name) argfun=set_branchName;;
            --deepClone) deepClone=true;;
            --quiet) QUIET=true;;
            --no-deepClone) deepClone=;;
            --leave-dotGit) leaveDotGit=true;;
            --fetch-submodules) fetchSubmodules=true;;
            --builder) builder=true;;
            -h|--help) usage; exit;;
            *)
                argi=$((argi + 1))
                case $argi in
                    1) url=$arg;;
                    2) rev=$arg;;
                    3) expHash=$arg;;
                    *)
                        echo "nix-prefetch-git: unexpected extra argument: $arg" >&2
                        exit 1
                        ;;
                esac
                ;;
        esac
    done
}

parse_args "$@"

# A repository URL is the one thing that cannot be defaulted.
if [[ -z "$url" ]]; then
    usage
fi
|
|
|
|
|
|
# Turn the current directory into an empty git repository whose "origin"
# remote points at the given URL.
# Arguments: $1 - repository URL
# Globals:   http_proxy (read) - when non-empty it is written to the
#            repository's http.proxy setting; a failure to do so is ignored.
init_remote(){
    local url=$1

    clean_git init
    clean_git remote add origin "$url"

    if [ -n "$http_proxy" ]; then
        # Best effort only: a failing proxy configuration must not abort.
        clean_git config http.proxy "$http_proxy" || true
    fi
}
|
|
|
|
# Look up which remote reference points at a given hash.
# Arguments: $1 - object hash, matched against the text immediately before
#                 the tab in each "git ls-remote origin" line
# Outputs:   the name of the first matching reference, or nothing if no line
#            matches.
ref_from_hash(){
    local hash=$1
    # For the first line whose hash column ends in $hash: keep only the
    # reference column, print it and stop reading.
    local sed_script="\,$hash\t, { s,\(.*\)\t\(.*\),\2,; p; q}"

    git ls-remote origin |
        sed -n "$sed_script"
}
|
|
|
|
# Return the hash of a reference if it exists on the remote repository.
# Arguments: $1 - full reference name (e.g. HEAD, refs/heads/master)
# Outputs:   the hash advertised for exactly that reference by
#            "git ls-remote origin", or nothing if it is not advertised.
hash_from_ref(){
    local ref=$1

    # Compare the reference column as a plain string.  Treating the name as
    # a regular expression would let "." match any character and would also
    # accept any longer name that merely starts with $ref.  The name travels
    # through the environment so awk does not interpret backslashes in it,
    # and both sides are concatenated with "" to force string comparison.
    git ls-remote origin |
        wanted_ref=$ref awk -F'\t' \
            '($2 "") == (ENVIRON["wanted_ref"] "") { print $1; exit }'
}
|
|
|
|
# Derive the store-path name for a checkout from its URL and reference.
#
# This must stay in sync with urlToName() in nix's fetchgit implementation,
# otherwise the same checkout would land in a different nix store path.
#
# Arguments: $1 - repository URL
#            $2 - reference or revision
# Outputs:   the URL's last path component without a ".git" suffix (and
#            without anything up to a ":"), followed by "-" and the first
#            seven characters of $2 when $2 is made up solely of lowercase
#            letters and digits.
url_to_name(){
    local url=$1
    local ref=$2
    local repo_name
    repo_name=$(basename "$url" .git | cut -d: -f2)

    if [[ ! $ref =~ ^[a-z0-9]+$ ]]; then
        echo "$repo_name"
        return
    fi

    echo "$repo_name-${ref:0:7}"
}
|
|
|
|
# Fetch everything and checkout the right sha1
#
# Arguments: $1 - object hash to check out (may be empty)
#            $2 - reference used to look the hash up when $1 is empty
# Globals:   builder (read)    - non-empty adds --progress to the fetch
#            branchName (read) - name of the local branch that is created
# Returns:   0 after a successful checkout, 1 if the fetch, the checkout or
#            the synthetic commit fails, or if the object is neither a commit
#            nor a tree.
checkout_hash(){
    local hash="$1"
    local ref="$2"
    local object_type
    local commit_id

    if [[ -z "$hash" ]]; then
        hash=$(hash_from_ref "$ref")
    fi

    clean_git fetch -t ${builder:+--progress} origin || return 1

    # An object that cannot be inspected is reported below as unrecognized.
    object_type=$(git cat-file -t "$hash") || object_type=

    case $object_type in
        commit)
            clean_git checkout -b "$branchName" "$hash" || return 1
            ;;
        tree)
            # A bare tree has no commit to check out, so wrap it in a
            # throw-away commit; git needs an identity to create one.
            clean_git config user.email "nix-prefetch-git@localhost"
            clean_git config user.name "nix-prefetch-git"
            commit_id=$(git commit-tree "$hash" -m "Commit created from tree hash $hash") || return 1
            clean_git checkout -b "$branchName" "$commit_id" || return 1
            ;;
        *)
            echo "Unrecognized git object type: $object_type" >&2
            return 1
            ;;
    esac
}
|
|
|
|
# Shallow-fetch a single branch or tag and check it out.
#
# Arguments: $1 - object hash (used to find a reference when $2 is empty)
#            $2 - reference to fetch (may be empty)
# Globals:   deepClone, builder, branchName (all read)
# Returns:   0 after a successful checkout; 1 when a deep clone was
#            requested, when no reference could be determined, or when the
#            fetch or checkout fails.
checkout_ref(){
    local hash="$1"
    local ref="$2"

    # The caller explicitly asked for a deep clone.  Deep clones
    # allow "git describe" and similar tools to work.  See
    # https://marc.info/?l=nix-dev&m=139641582514772
    # for a discussion.
    [[ -z "$deepClone" ]] || return 1

    # Without an explicit reference, see whether the remote advertises one
    # for this hash.
    [[ -n "$ref" ]] || ref=$(ref_from_hash "$hash")

    # Nothing to fetch by name.
    [[ -n "$ref" ]] || return 1

    # --depth option is ignored on http repository.
    clean_git fetch ${builder:+--progress} --depth 1 origin +"$ref" || return 1
    clean_git checkout -b "$branchName" FETCH_HEAD || return 1
}
|
|
|
|
# Update submodules
#
# Registers the submodules of the repository in the current directory and
# clones each of them, at the hash the superproject records, via clone().
# Globals: none written; relies on clean_git and clone.
init_submodules(){
    local l hash dir name url

    # Add urls into .git/config file
    clean_git submodule init

    # list submodule directories and their hashes
    git submodule status |
    while read -r l; do
        # checkout each submodule
        # First column is the hash, prefixed with "-" while uninitialised.
        hash=$(echo "$l" | awk '{print $1}' | tr -d '-')
        # Everything after the hash, minus a trailing " (describe output)".
        dir=$(echo "$l" | sed -n 's/^.[0-9a-f]\+ \(.*[^)]*\)\( (.*)\)\?$/\1/p')
        # The pattern is quoted so git receives the regular expression
        # exactly as written, with its escaped dots, and so it can never be
        # expanded against file names in the working directory.
        name=$(
            git config -f .gitmodules --get-regexp 'submodule\..*\.path' |
            sed -n "s,^\(.*\)\.path $dir\$,\\1,p")
        url=$(git config --get "${name}.url")

        clone "$dir" "$url" "$hash" ""
    done
}
|
|
|
|
# Check a repository out into an existing directory.
#
# Arguments: $1 - target directory (must exist)
#            $2 - repository URL
#            $3 - object hash to check out (may be empty)
#            $4 - reference to check out (may be empty)
# Globals:   fetchSubmodules, builder (read)
# Returns:   1 if the directory cannot be entered or neither checkout
#            strategy succeeds; otherwise the status of returning to the
#            directory the function was called from.
clone(){
    local top=$PWD
    local dir="$1"
    local url="$2"
    local hash="$3"
    local ref="$4"

    cd "$dir" || return 1

    # Initialize the repository.
    init_remote "$url"

    # Download data from the repository: try the cheap single-ref fetch
    # first and fall back to fetching everything.  Failure is reported with
    # an explicit return so that nothing below runs on a broken checkout,
    # whether or not the caller has errexit in effect.
    if ! checkout_ref "$hash" "$ref" && ! checkout_hash "$hash" "$ref"; then
        echo 1>&2 "Unable to checkout $hash$ref from $url."
        return 1
    fi

    # Checkout linked sources.
    if test -n "$fetchSubmodules"; then
        init_submodules
    fi

    # Outside builder mode, a .topdeps file triggers population of the
    # TopGit branches when the tg tool is available.
    if [ -z "$builder" ] && [ -f .topdeps ]; then
        if tg help &>/dev/null; then
            echo "populating TopGit branches..."
            tg remote --populate origin
        else
            echo "WARNING: would populate TopGit branches but TopGit is not available" >&2
            echo "WARNING: install TopGit to fix the problem" >&2
        fi
    fi

    cd "$top"
}
|
|
|
|
# Strip a checked-out repository of everything that varies between runs:
# drop the remote branches and the tags that must not be kept, repack
# everything into one pack and garbage collect what is left unreferenced.
#
# Arguments: $1 - path of the work tree whose .git directory is cleaned
make_deterministic_repo(){
    local repo="$1"

    # Everything happens in a sub-shell so the caller's working directory
    # stays untouched.
    (
        cd "$repo"

        # Files that carry timestamps or other non-deterministic state.
        rm -rf .git/logs/ .git/hooks/ .git/index .git/FETCH_HEAD .git/ORIG_HEAD \
            .git/refs/remotes/origin/HEAD .git/config

        # Drop every remote-tracking branch.
        git branch -r | while read -r branch; do
            clean_git branch -rD "$branch"
        done

        # Delete the tags listed by "git tag --contains HEAD", sparing the
        # one that points exactly at HEAD.
        maybe_tag=$(git tag --points-at HEAD)
        git tag --contains HEAD | while read -r tag; do
            if [ "$tag" = "$maybe_tag" ]; then
                continue
            fi
            clean_git tag -d "$tag"
        done

        # Full repack.  Must run single-threaded, or else we lose determinism.
        clean_git config pack.threads 1
        clean_git repack -A -d -f
        # The config file only existed to carry pack.threads.
        rm -f .git/config

        # Garbage collect unreferenced objects.
        # Note: --keep-largest-pack prevents non-deterministic ordering of
        # packs listed in .git/objects/info/packs by only using a single pack
        clean_git gc --prune=all --keep-largest-pack
    )
}
|
|
|
|
|
|
# Check out the revision the user asked for and record details about it.
#
# Arguments: $1 - target directory
#            $2 - repository URL
#            $3 - revision: HEAD, a refs/... name, a hexadecimal hash, or a
#                 tag name (defaults to HEAD)
# Globals:   fullRev, humanReadableRev, commitDate, commitDateStrict8601
#            (written); branchName, leaveDotGit,
#            NIX_PREFETCH_GIT_CHECKOUT_HOOK (read)
clone_user_rev() {
    local dir="$1"
    local url="$2"
    local rev="${3:-HEAD}"

    # Perform the checkout.  clone() output is kept off stdout.
    if [[ "$rev" == HEAD || "$rev" == refs/* ]]; then
        clone "$dir" "$url" "" "$rev" 1>&2
    elif test -z "$(echo "$rev" | tr -d 0123456789abcdef)"; then
        # Nothing but hexadecimal digits: treat it as an object hash.
        clone "$dir" "$url" "$rev" "" 1>&2
    else
        # if revision is not hexadecimal it might be a tag
        clone "$dir" "$url" "" "refs/tags/$rev" 1>&2
    fi

    pushd "$dir" >/dev/null
    # Resolve the requested revision, falling back to the local branch that
    # the checkout created.
    fullRev=$( (git rev-parse "$rev" 2>/dev/null || git rev-parse "refs/heads/$branchName") | tail -n1)
    humanReadableRev=$(git describe "$fullRev" 2> /dev/null || git describe --tags "$fullRev" 2> /dev/null || echo -- none --)
    commitDate=$(git show -1 --no-patch --pretty=%ci "$fullRev")
    commitDateStrict8601=$(git show -1 --no-patch --pretty=%cI "$fullRev")
    popd >/dev/null

    # Allow doing additional processing before .git removal
    eval "$NIX_PREFETCH_GIT_CHECKOUT_HOOK"

    if test -n "$leaveDotGit"; then
        # Keep the .git directories, but make their contents reproducible.
        find "$dir" -name .git | while read -r gitdir; do
            make_deterministic_repo "$(readlink -f "$gitdir/..")"
        done
    else
        echo "removing \`.git'..." >&2
        find "$dir" -name .git -print0 | xargs -0 rm -rf
    fi
}
|
|
|
|
# Commands to run when the script exits; entries are appended elsewhere.
exit_handlers=()

# EXIT trap: run every registered handler in registration order, handing
# each one the script's exit status as its last argument.
run_exit_handlers() {
    # Must be the very first command, before anything overwrites $?.
    exit_status=$?
    local idx
    for (( idx = 0; idx < ${#exit_handlers[@]}; idx++ )); do
        handler=${exit_handlers[idx]}
        eval "$handler $exit_status"
    done
}

trap run_exit_handlers EXIT
|
|
|
|
# Exit handler for quiet mode: put stderr back on the descriptor saved as
# fd 3, replay the captured diagnostics if the script failed, and delete the
# capture file.
#
# Arguments: $1 - exit status of the script (treated as 0 when missing)
# Globals:   errfile (read) - file that collected stderr during the run
quiet_exit_handler() {
    local status="${1:-0}"

    # Restore the original stderr and close the spare descriptor.
    exec 2>&3 3>&-

    # Only a failing run gets to see what was suppressed.
    if [ "$status" -ne 0 ]; then
        cat "$errfile" >&2
    fi
    rm -f "$errfile"
}
|
|
|
|
# Silence the script: from here on stderr is collected in a temporary file,
# while the previous stderr stays reachable as file descriptor 3.
# quiet_exit_handler is registered to deal with the file at exit.
# Globals: errfile (written), exit_handlers (appended to)
quiet_mode() {
    local template="${TMPDIR:-/tmp}/git-checkout-err-XXXXXXXX"

    errfile="$(mktemp "$template")"
    exit_handlers+=(quiet_exit_handler)

    # Save the current stderr on fd 3, then point stderr at the file.
    exec 3>&2
    exec 2>"$errfile"
}
|
|
|
|
# Escape a string for use inside a double-quoted JSON string literal.
#
# Arguments: $1 - raw string
# Outputs:   the string with backslash, double quote, backspace, form feed,
#            newline, carriage return and tab replaced by their two-character
#            JSON escapes, followed by a newline.
json_escape() {
    local s="$1"
    local backslash='\'
    local quote='"'

    # Control characters are spelled with $'...' so they survive editors and
    # copy/paste, and every pattern and replacement is quoted so it is taken
    # literally.  Backslashes go first so the ones added by later steps are
    # not doubled again.
    s=${s//"$backslash"/"$backslash$backslash"}   # \
    s=${s//"$quote"/"$backslash$quote"}           # "
    s=${s//$'\b'/"${backslash}b"}                 # \b (backspace)
    s=${s//$'\f'/"${backslash}f"}                 # \f (form feed)
    s=${s//$'\n'/"${backslash}n"}                 # \n (newline)
    s=${s//$'\r'/"${backslash}r"}                 # \r (carriage return)
    s=${s//$'\t'/"${backslash}t"}                 # \t (tab)

    # printf, unlike echo, never mistakes the value for an option.
    printf '%s\n' "$s"
}
|
|
|
|
# Report the outcome of the prefetch.
#
# Unless QUIET is set, a human-readable summary goes to stderr.  When a hash
# is given, a JSON object describing the checkout is written to stdout.
#
# Arguments: $1 - content hash of the checkout (may be empty)
# Globals:   QUIET, url, fullRev, finalPath, humanReadableRev, commitDate,
#            commitDateStrict8601, hashType, fetchSubmodules, deepClone,
#            leaveDotGit (all read)
print_results() {
    # Local, so the caller's own "hash" variable is never touched.
    local hash="$1"

    if [[ -z "$QUIET" ]]; then
        echo "" >&2
        echo "git revision is $fullRev" >&2
        if [[ -n "$finalPath" ]]; then
            echo "path is $finalPath" >&2
        fi
        echo "git human-readable version is $humanReadableRev" >&2
        echo "Commit date is $commitDate" >&2
        if [[ -n "$hash" ]]; then
            echo "hash is $hash" >&2
        fi
    fi

    # The key of the hash entry is the hash algorithm's name.
    if [[ -n "$hash" ]]; then
        cat <<EOF
{
  "url": "$(json_escape "$url")",
  "rev": "$(json_escape "$fullRev")",
  "date": "$(json_escape "$commitDateStrict8601")",
  "path": "$(json_escape "$finalPath")",
  "$(json_escape "$hashType")": "$(json_escape "$hash")",
  "fetchSubmodules": $([[ -n "$fetchSubmodules" ]] && echo true || echo false),
  "deepClone": $([[ -n "$deepClone" ]] && echo true || echo false),
  "leaveDotGit": $([[ -n "$leaveDotGit" ]] && echo true || echo false)
}
EOF
    fi
}
|
|
|
|
# Exit handler: throw away the temporary checkout directory.
# Globals: tmpPath (read) - directory to delete, contents included
remove_tmpPath() {
    rm -r -f "$tmpPath"
}
|
|
|
|
# ---------------------------------------------------------------------------
# Main driver.
#
# Builder mode (--builder) checks the revision out straight into $out.
# Otherwise the checkout goes to a temporary directory, is hashed and added
# to the nix store, and the result is reported by print_results (plus the
# store path on stdout when PRINT_PATH is set).
# ---------------------------------------------------------------------------

if [[ -n "$QUIET" ]]; then
    quiet_mode
fi

if [[ -z "$branchName" ]]; then
    branchName=fetchgit
fi

if [[ -n "$builder" ]]; then
    # [[ ... && ... ]] instead of "test ... -a ...": the -a form is
    # obsolescent and can misparse operands that look like operators.
    [[ -n "$out" && -n "$url" && -n "$rev" ]] || usage
    mkdir -p "$out"
    clone_user_rev "$out" "$url" "$rev"
else
    if [[ -z "$hashType" ]]; then
        hashType=sha256
    fi

    # If the hash was given, a file with that hash may already be in the
    # store.
    if [[ -n "$expHash" ]]; then
        finalPath=$(nix-store --print-fixed-path --recursive "$hashType" "$expHash" "$(url_to_name "$url" "$rev")")
        if ! nix-store --check-validity "$finalPath" 2> /dev/null; then
            finalPath=
        fi
        hash=$expHash
    fi

    # If we don't know the hash or a path with that hash doesn't exist,
    # download the file and add it to the store.
    if [[ -z "$finalPath" ]]; then

        tmpPath="$(mktemp -d "${TMPDIR:-/tmp}/git-checkout-tmp-XXXXXXXX")"
        exit_handlers+=(remove_tmpPath)

        tmpFile="$tmpPath/$(url_to_name "$url" "$rev")"
        mkdir -p "$tmpFile"

        # Perform the checkout.
        clone_user_rev "$tmpFile" "$url" "$rev"

        # Compute the hash.
        hash=$(nix-hash --type "$hashType" --base32 "$tmpFile")

        # Add the downloaded file to the Nix store.
        finalPath=$(nix-store --add-fixed --recursive "$hashType" "$tmpFile")

        if [[ -n "$expHash" && "$expHash" != "$hash" ]]; then
            echo "hash mismatch for URL \`$url'. Got \`$hash'; expected \`$expHash'." >&2
            exit 1
        fi
    fi

    print_results "$hash"

    if [[ -n "$PRINT_PATH" ]]; then
        echo "$finalPath"
    fi
fi
|