From a0c2b406481b7e9911027565c8643fc573419bd7 Mon Sep 17 00:00:00 2001 From: Tobias Ulmer Date: Tue, 10 Dec 2013 21:26:46 +0100 Subject: [PATCH] Integrate hashcache into hash module Use hash_file_once() most everywhere. Hashcache keeps track of number of cache hits and limits the cache to the 1000 most used. It's also much more strict in what is considered a match. Signed-off-by: Tobias Ulmer --- generic/e2lib.lua | 2 + generic/hash.lua | 226 +++++++++++++++++++++++++++++++++++++++++++++- local/digest.lua | 12 +-- local/e2tool.lua | 212 ++++++------------------------------------- 4 files changed, 253 insertions(+), 199 deletions(-) diff --git a/generic/e2lib.lua b/generic/e2lib.lua index ec02ad1..34b5e8b 100644 --- a/generic/e2lib.lua +++ b/generic/e2lib.lua @@ -58,6 +58,7 @@ local cache = require("cache") local eio = require("eio") local le2lib = require("le2lib") local trace = require("trace") +local hash = require("hash") e2lib.globals = strict.lock({ logflags = { @@ -659,6 +660,7 @@ function e2lib.finish(returncode) if not returncode then returncode = 0 end + hash.hcache_store() e2lib.cleanup() if e2lib.globals.debuglogfile then eio.fclose(e2lib.globals.debuglogfile) diff --git a/generic/hash.lua b/generic/hash.lua index 5172f1b..b575946 100644 --- a/generic/hash.lua +++ b/generic/hash.lua @@ -1,4 +1,4 @@ ---- Hash +--- Hash module with built-in caching. -- @module generic.hash --[[ @@ -30,11 +30,145 @@ ]] local hash = {} +local e2lib = require("e2lib") local eio = require("eio") local err = require("err") +local lsha1 = require("lsha1") local strict = require("strict") local trace = require("trace") -local lsha1 = require("lsha1") + +--- The hashcache lookup dictionary. +local hcachedict = false +--- Path to the persistent storage file. +local hcachestorage = false + +--- Internal hash cache entry dictionary. +-- @table hce +-- @field dev See stat. +-- @field ino See stat. +-- @field size See stat. +-- @field mtime See stat. +-- @field mtime_nsec See stat. 
+-- @field ctime See stat. +-- @field ctime_nsec See stat. +-- @field hash SHA-1 checksum +-- @field hit Count cache hits. + +--- Load or create the persistent hashcache file. +-- @param filename Path to hashcache file. If filename does not exist, it +-- will be created when calling hcache_store(). +-- @return True on success, false on error. Errors only have an effect on +-- performance, and should usually be ignored. +-- @return Error object on failure. +-- @see hcache_store +function hash.hcache_load(filename) + local rc, re, hctab, chunk, msg + + if hcachedict then + return false, err.new("hashcache already initialised") + end + + hcachestorage = filename + + hctab = {} + chunk, msg = loadfile(filename) + if not chunk then + return false, err.new("loading hashcache %q failed: %s", filename, msg) + end + + -- set empty environment for this chunk + setfenv(chunk, {}) + hctab = chunk() + if type(hctab) ~= "table" then + return false, err.new("ignoring malformed hashcache %q", filename) + end + + for path,hce in pairs(hctab) do + if type(path) == "string" and #path > 0 + and type(hce.hash) == "string" and #hce.hash == 40 + and type(hce.mtime) == "number" + and type(hce.mtime_nsec) == "number" + and type(hce.ctime) == "number" + and type(hce.ctime_nsec) == "number" + and type(hce.size) == "number" + and type(hce.dev) == "number" + and type(hce.ino) == "number" + and type(hce.hit) == "number" then + + if not hcachedict then + hcachedict = {} + end + + hcachedict[path] = { + hash = hce.hash, + mtime = hce.mtime, + mtime_nsec = hce.mtime_nsec, + ctime = hce.ctime, + ctime_nsec = hce.ctime_nsec, + size = hce.size, + dev = hce.dev, + ino = hce.ino, + hit = hce.hit, + } + else + hcachedict = false + return false, + err.new("malformed hashcache entry, ignoring %q", filename) + end + end + + return true +end + +--- Save the hashcache to persistent storage, for later use. The hashcache file +-- location is set by calling hcache_load(). 
+-- @return True on success, false on error. Errors should usually be ignored. +-- @return Error object on failure. +-- @see hcache_load +function hash.hcache_store() + local rc, re, hcachevec, e, out + + if not hcachedict or not hcachestorage then + return true + end + + hcachevec = {} + for path,hce in pairs(hcachedict) do + table.insert(hcachevec, {path=path, hce=hce}) + end + + local function comp(t1, t2) + if t1.hce.hit > t2.hce.hit then + return true + end + return false + end + + table.sort(hcachevec, comp) + + out = { "return {\n" } + for i,v in ipairs(hcachevec) do + table.insert(out, + string.format( + "[%q] = { hash=%q, mtime=%d, mtime_nsec=%d, ctime=%d, " .. + "ctime_nsec=%d, size=%d, dev=%d, ino=%d, hit=%d },\n", + v.path, v.hce.hash, v.hce.mtime, v.hce.mtime_nsec, v.hce.ctime, + v.hce.ctime_nsec, v.hce.size, v.hce.dev, v.hce.ino, v.hce.hit)) + + if v.hce.hit == 0 and i > 10000 then + break + end + end + table.insert(out, "}\n") + + rc, re = eio.file_write(hcachestorage, table.concat(out)) + if not rc then + e = err.new("writing hashcache file") + return false, e:cat(re) + end + + return true +end --- Create a hash context. -- @return Hash context object or false on error. @@ -127,13 +261,91 @@ function hash.hash_file(hc, path) return true end ---- Hash a file at once. +--- Look up the checksum for a file in the hashcache. +-- @param path Absolute path to the file. +-- @return Checksum or false if path is not in the cache or an error occurred. +local function hcache_lookup(path) + local sb, hce + + if not hcachedict then + return false + end + + -- Try not to return checksums for files which are inaccessible. 
+ if not e2lib.exists(path, false) then + return false + end + + sb = e2lib.stat(path) + if not sb then + return false + end + + hce = hcachedict[path] + if not hce + or hce.mtime ~= sb.mtime + or hce.mtime_nsec ~= sb.mtime_nsec + or hce.ctime ~= sb.ctime + or hce.ctime_nsec ~= sb.ctime_nsec + or hce.size ~= sb.size + or hce.dev ~= sb.dev + or hce.ino ~= sb.ino then + + return false + end + + hce.hit = hce.hit + 1 + return hce.hash +end + +--- Add file and checksum to the hashcache. +-- @param path Path to the file. +-- @param hash SHA1 checksum string, length 40. +-- @return True on success, false on error. +local function hcache_add(path, hash) + assert(type(path) == "string" and #path > 0) + assert(type(hash) == "string" and #hash == 40) + + local sb + + if not hcachedict then + hcachedict = {} + end + + sb = e2lib.stat(path) + if not sb then + return false + end + + hcachedict[path] = { + hash = hash, + mtime = sb.mtime, + mtime_nsec = sb.mtime_nsec, + ctime = sb.ctime, + ctime_nsec = sb.ctime_nsec, + size = sb.size, + dev = sb.dev, + ino = sb.ino, + hit = 0, + } + + return true +end + +--- Hash a file at once. Unlike hash_file(), this function makes use of a +-- persistent cache. -- @param path Full path to the file. -- @return Checksum string, or false on error. -- @return Error object on failure. +-- @see hcache_load function hash.hash_file_once(path) local rc, re, hc, cs + cs = hcache_lookup(path) + if cs then + return cs + end + hc, re = hash.hash_start() if not hc then return false, re @@ -145,7 +357,13 @@ function hash.hash_file_once(path) return false, re end - return hash.hash_finish(hc) + cs, re = hash.hash_finish(hc) + if not cs then + return false, re + end + + hcache_add(path, cs) + return cs end --- Get checksum and release hash context. 
diff --git a/local/digest.lua b/local/digest.lua index eb6f35b..cd96fd2 100644 --- a/local/digest.lua +++ b/local/digest.lua @@ -282,17 +282,7 @@ local function compute_checksum_entry(pos, entry, directory, verify) if entry.digest == digest.SHA1 then -- XXX: We assume the hash module returns SHA1 checksums. Not nice. - local hc, re = hash.hash_start() - if not hc then - return false, re - end - - rc, re = hash.hash_file(hc, filename) - if not rc then - return false, re - end - - computedcs, re = hash.hash_finish(hc) + computedcs, re = hash.hash_file_once(filename) if not computedcs then return false, re end diff --git a/local/e2tool.lua b/local/e2tool.lua index c47e5e1..8dd5683 100644 --- a/local/e2tool.lua +++ b/local/e2tool.lua @@ -704,56 +704,6 @@ function e2tool.local_init(path, tool) return info end ---- hashcache setup. -local function hashcache_setup(info) - local e = err.new("reading hash cache") - local rc, re - e2lib.logf(4, "loading hashcache from file: %s", info.hashcache_file) - info.hashcache = {} - - local c, msg = loadfile(info.hashcache_file) - if not c then - e2lib.warnf("WHINT", "loading hashcache failed: %s", msg) - return true - end - -- set empty environment for this chunk - setfenv(c, {}) - local newcache = c() - - if type(newcache) ~= "table" then - e2lib.warnf("WHINT", "ignoring malformed hashcache") - return true - end - - for id, hce in pairs(newcache) do - if type(id) == "string" and id:match("([^:]+):(%S+)") - and type(hce.hash) == "string" and string.len(hce.hash) == 40 - and type(hce.mtime) == "number" - and type(hce.mtime_nsec) == "number" - and type(hce.ctime) == "number" - and type(hce.ctime_nsec) == "number" - and type(hce.size) == "number" - and type(hce.dev) == "number" - and type(hce.ino) == "number" then - - info.hashcache[id] = { - hash = hce.hash, - mtime = hce.mtime, - mtime_nsec = hce.mtime_nsec, - ctime = hce.ctime, - ctime_nsec = hce.ctime_nsec, - size = hce.size, - dev = hce.dev, - ino = hce.ino, - } - else - 
e2lib.warnf("WHINT", "ignoring malformed hashcache entry") - end - end - - return true -end - --- check for configuration syntax compatibility and log informational -- message including list of supported syntaxes if incompatibility is -- detected. @@ -1319,6 +1269,9 @@ function e2tool.collect_project_info(info, skip_load_config) e2lib.logf(4, "VERSION: %s", buildconfig.VERSION) e2lib.logf(4, "VERSIONSTRING: %s", buildconfig.VERSIONSTRING) + hash.hcache_load(e2lib.join(info.root, ".e2/hashcache")) + -- no error check required + --XXX create some policy module where the following policy settings --XXX and functions reside (server names, paths, etc.) @@ -1351,12 +1304,6 @@ function e2tool.collect_project_info(info, skip_load_config) -- if x86_64 mode is requested. info.chroot_call_prefix["x86_64"] = "" - info.hashcache_file = e2lib.join(info.root, ".e2/hashcache") - rc, re = hashcache_setup(info) - if not rc then - return false, e:cat(re) - end - if e2option.opts["check"] then local f = e2lib.join(info.root, e2lib.globals.e2version_file) local v, re = e2lib.parse_e2versionfile(f) @@ -1860,26 +1807,6 @@ function e2tool.dsort(info) return e2tool.dlist_recursive(info, info.project.default_results) end ---- hash a file addressed by server name and location. 
--- @param info info structure --- @param server the server name --- @param location file location relative to the server --- @return string the hash value, nil on error --- @return nil, an error string on error -local function hash_file(info, server, location) - local e = err.new("error hashing file") - local cache_flags = { cache = true } - local rc, re = info.cache:cache_file(server, location, cache_flags) - if not rc then - return nil, e:cat(re) - end - local path, re = info.cache:file_path(server, location, cache_flags) - if not path then - return nil, e:cat(re) - end - return hash.hash_file_once(path) -end - --- verify that a file addressed by server name and location matches the -- checksum given in the sha1 parameter. -- @param info info structure @@ -1891,7 +1818,7 @@ end function e2tool.verify_hash(info, server, location, sha1) local rc, re local e = err.new("error verifying checksum") - local is_sha1, re = hash_file(info, server, location) + local is_sha1, re = e2tool.fileid(info, {server=server, location=location}) if not is_sha1 then return false, e:cat(re) end @@ -1939,87 +1866,6 @@ local function projid(info) return info.projid end ---- Write out hashcache file. --- @param info Info table. --- @return True on success, false on error. --- @return Error object on failure. -local function hashcache_write(info) - local rc, re, e, out - - out = { "return {\n" } - for k,hce in pairs(info.hashcache) do - table.insert(out, string.format( - "[%q] = { hash=%q, mtime=%d, mtime_nsec=%d, ctime=%d, " .. - "ctime_nsec=%d, size=%d, dev=%d, ino=%d },\n", - k, hce.hash, hce.mtime, hce.mtime_nsec, - hce.ctime, hce.ctime_nsec, hce.size, hce.dev, hce.ino)) - end - table.insert(out, "}\n") - - rc, re = eio.file_write(info.hashcache_file, table.concat(out)) - if not rc then - e = err.new("writing hash cache file") - return false, e:cat(re) - end - - return true -end - ---- hashcache. 
-local function hashcache(info, file) - local e = err.new("getting fileid from hash cache failed") - local rc, re, fileid - - local p, re = info.cache:file_path(file.server, file.location, {}) - if not p then - return nil, e:cat(re) - end - local s, re = e2lib.stat(p) - if not s then - return nil, e:cat(re) - end - - local id = string.format("%s:%s", file.server, file.location) - local hce = info.hashcache[id] - if hce - -- We don't just care about the file contents (mtime), - -- inode changes could make the file inaccessible, so check ctime too - and s.mtime == hce.mtime - and s.mtime_nsec == hce.mtime_nsec - and s.ctime == hce.ctime - and s.ctime_nsec == hce.ctime_nsec - and s.size == hce.size - and s.dev == hce.dev - and s.ino == hce.ino then - assert(type(hce.hash) == "string" and string.len(hce.hash) == 40) - return hce.hash - end - - local fileid - fileid, re = hash_file(info, file.server, file.location) - if not fileid then - return nil, e:cat(re) - end - - assert(type(fileid) == "string" and string.len(fileid) == 40) - hce = { - hash = fileid, - mtime = s.mtime, - mtime_nsec = s.mtime_nsec, - ctime = s.ctime, - ctime_nsec = s.ctime_nsec, - size = s.size, - dev = s.dev, - ino = s.ino, - } - -- update hashcache and the hashcachefile - -- TBD: mark hashcache dirty and write hashcachefile once. - info.hashcache[id] = hce - hashcache_write(info) -- an error here is not fatal - - return fileid -end - --- verify that remote files match the checksum. The check is skipped when -- check-remote is not enabled or cache is not enabled. 
-- @param info @@ -2075,15 +1921,7 @@ local function verify_remote_fileid(info, file, fileid) e:cat(err.new("Could not extract digest from digest table")) end elseif u.transport == "file" then - hc, re = hash.hash_start() - if not hc then - return false, e:cat(re) - end - rc, re = hash.hash_file(hc, e2lib.join("/", u.path)) - if not rc then - return false, e:cat(re) - end - remote_fileid, re = hash.hash_finish(hc) + remote_fileid, re = hash.hash_file_once(e2lib.join("/", u.path)) if not remote_fileid then return false, e:cat(re) end @@ -2104,15 +1942,7 @@ local function verify_remote_fileid(info, file, fileid) return false, e:cat(re) end - hc, re = hash.hash_start() - if not hc then - return false, e:cat(re) - end - rc, re = hash.hash_file(hc, tmpfile) - if not rc then - return false, e:cat(re) - end - remote_fileid, re = hash.hash_finish(hc) + remote_fileid, re = hash.hash_file_once(tmpfile) if not remote_fileid then return false, e:cat(re) end @@ -2140,22 +1970,36 @@ end -- @return fileid string: hash value, or nil -- @return an error object on failure function e2tool.fileid(info, file) - local fileid - local re - local e = err.new("error calculating file id for file: %s:%s", - file.server, file.location) + local rc, re, e, fileid, path + local cache_flags = { cache = true } + + e = err.new("error calculating file id for file: %s:%s", + file.server, file.location) + if file.sha1 then fileid = file.sha1 else - fileid, re = hashcache(info, file) + rc, re = info.cache:cache_file(file.server, file.location, cache_flags) + if not rc then + return false, e:cat(re) + end + + path, re = info.cache:file_path(file.server, file.location, cache_flags) + if not path then + return false, e:cat(re) + end + + fileid, re = hash.hash_file_once(path) if not fileid then - return nil, e:cat(re) + return false, e:cat(re) end end - local rc, re = verify_remote_fileid(info, file, fileid) + + rc, re = verify_remote_fileid(info, file, fileid) if not rc then - return nil, re + return 
false, e:cat(re) end + return fileid end -- 2.39.5