Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
mm/vmscan.c | 3 +++
1 files changed, 3 insertions(+), 0 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 5fa3eda..ae44189 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2034,6 +2034,9 @@ void wakeup_kswapd(struct zone *zone, int order)
if (!populated_zone(zone))
return;
+ if (freezer_is_on())
+ return;
+
pgdat = zone->zone_pgdat;
if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0))
return;
--
1.5.6.3
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
include/linux/freezer.h | 3 +++
kernel/power/process.c | 4 ++++
2 files changed, 7 insertions(+), 0 deletions(-)
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 5a361f8..4f50655 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -121,6 +121,8 @@ static inline void set_freezable(void)
current->flags &= ~PF_NOFREEZE;
}
+extern void thaw_kernel_threads(void);
+
/*
* Tell the freezer that the current task should be frozen by it and that it
* should send a fake signal to the task to freeze it.
@@ -172,6 +174,7 @@ static inline int freeze_processes(void) { BUG(); return 0; }
static inline void thaw_processes(void) {}
static inline int try_to_freeze(void) { return 0; }
+static inline void thaw_kernel_threads(void) { }
static inline void freezer_do_not_count(void) {}
static inline void freezer_count(void) {}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index ca63401..d464ef7 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -152,3 +152,7 @@ void thaw_processes(void)
printk("done.\n");
}
+/*
+ * thaw_kernel_threads - wrapper that calls thaw_tasks(true).
+ *
+ * NOTE(review): presumably the 'true' argument limits thawing to kernel
+ * (non-userspace) tasks, as the function name suggests - confirm against
+ * thaw_tasks().
+ */
+void thaw_kernel_threads(void)
+{
+ thaw_tasks(true);
+}
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/power.h | 3 +++
kernel/power/snapshot.c | 13 ++++++++-----
2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/kernel/power/power.h b/kernel/power/power.h
index ce81df1..4cc59d5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -287,6 +287,9 @@ struct memory_bitmap {
* objects
*/
struct bm_position cur; /* most recently used bit position */
+ struct bm_position iter; /* most recently used bit position
+ * when iterating over a bitmap.
+ */
};
extern int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 8020644..786227c 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -234,6 +234,9 @@ void memory_bm_position_reset(struct memory_bitmap *bm)
{
bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
bm->cur.bit = 0;
+
+ bm->iter.block = list_entry(bm->blocks.next, struct bm_block, hook);
+ bm->iter.bit = 0;
}
/**
@@ -519,23 +522,23 @@ unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
struct bm_block *bb;
int bit;
- bb = bm->cur.block;
+ bb = bm->iter.block;
do {
- bit = bm->cur.bit;
+ bit = bm->iter.bit;
bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
if (bit < bm_block_bits(bb))
goto Return_pfn;
bb = list_entry(bb->hook.next, struct bm_block, hook);
- bm->cur.block = bb;
- bm->cur.bit = 0;
+ bm->iter.block = bb;
+ bm->iter.bit = 0;
} while (&bb->hook != &bm->blocks);
memory_bm_position_reset(bm);
return BM_END_OF_MAP;
Return_pfn:
- bm->cur.bit = bit + 1;
+ bm->iter.bit = bit + 1;
return bb->start_pfn + bit;
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/Kconfig | 15 ++
kernel/power/Makefile | 1 +
kernel/power/tuxonice_compress.c | 434 ++++++++++++++++++++++++++++++++++++++
3 files changed, 450 insertions(+), 0 deletions(-)
create mode 100644 kernel/power/tuxonice_compress.c
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 498b5c2..1b474a6 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -195,6 +195,21 @@ menuconfig TOI_CORE
comment "General Options"
depends on TOI_CORE
+ config TOI_CRYPTO
+ bool "Compression support"
+ depends on TOI_CORE && CRYPTO
+ default y
+ ---help---
+ This option adds support for using cryptoapi compression
+ algorithms. Compression is particularly useful as it can
+ more than double your suspend and resume speed (depending
+ upon how well your image compresses).
+
+ You probably want this, so say Y here.
+
+ comment "No compression support available without Cryptoapi support."
+ depends on TOI_CORE && !CRYPTO
+
config TOI_KEEP_IMAGE
bool "Allow Keep Image Mode"
depends on TOI_CORE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 9ad139e..3803866 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -15,6 +15,7 @@ tuxonice_core-objs += tuxonice_storage.o tuxonice_netlink.o
endif
obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
+obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c
new file mode 100644
index 0000000..1ce3d02
--- /dev/null
+++ b/kernel/power/tuxonice_compress.c
@@ -0,0 +1,434 @@
+/*
+ * kernel/power/tuxonice_compress.c
+ *
+ * Copyright (C) 2003-2008 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * This file is released under the GPLv2.
+ *
+ * This file contains data compression routines for TuxOnIce,
+ * using cryptoapi.
+ */
+
+#include <linux/suspend.h>
+#include <linux/highmem.h>
+#include <linux/vmalloc.h>
+#include <linux/crypto.h>
+
+#include "tuxonice_builtin.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+
+static int toi_expected_compression;
+
+static struct toi_module_ops toi_compression_ops;
+static struct toi_module_ops *next_driver;
+
+static char toi_compressor_name[32] = "lzo";
+
+static DEFINE_MUTEX(stats_lock);
+
+/*
+ * Working state kept separately for each CPU (see the per-CPU 'contexts'
+ * variable).
+ */
+struct cpu_context {
+ /*
+  * Page obtained with toi_get_zeroed_page(); holds the compressed form
+  * of a page on its way to, or back from, the next module.
+  */
+ u8 *page_buffer;
+ /* Transform from crypto_alloc_comp(); NULL when none is set up. */
+ struct crypto_comp *transform;
+ /*
+  * Set to the input size before compressing, then holds the output
+  * length reported by the compressor.
+  */
+ unsigned int len;
+ /* kmap()ed address of the page currently being compressed. */
+ char *buffer_start;
+ /* 2 * PAGE_SIZE vmalloc_32() area that receives compressor output. */
+ char *output_buffer;
+};
+
+static DEFINE_PER_CPU(struct cpu_context, contexts);
+
+static int toi_compress_prepare_result;
+
+/*
+ * toi_compress_cleanup
+ * @toi_or_resume: When zero, nothing is done.
+ *
+ * Release the transform, page buffer and output buffer of every online
+ * CPU and leave all three pointers NULL.
+ */
+static void toi_compress_cleanup(int toi_or_resume)
+{
+	int cpu;
+
+	if (!toi_or_resume)
+		return;
+
+	for_each_online_cpu(cpu) {
+		struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+		if (ctx->transform != NULL) {
+			crypto_free_comp(ctx->transform);
+			ctx->transform = NULL;
+		}
+
+		if (ctx->page_buffer != NULL) {
+			toi_free_page(16, (unsigned long) ctx->page_buffer);
+			ctx->page_buffer = NULL;
+		}
+
+		if (ctx->output_buffer != NULL) {
+			vfree(ctx->output_buffer);
+			ctx->output_buffer = NULL;
+		}
+	}
+}
+
+/*
+ * toi_compress_crypto_prepare
+ *
+ * Prepare to do some work: for every online CPU allocate a compression
+ * transform for the algorithm named in toi_compressor_name, a one-page
+ * staging buffer and a 2 * PAGE_SIZE output buffer.
+ *
+ * Returns: 0 on success, 1 if no compressor name is set or a transform
+ * cannot be allocated, -ENOMEM if a buffer cannot be allocated. Nothing
+ * is released here on failure; whatever was already allocated stays in
+ * the per-CPU contexts until toi_compress_cleanup() frees it.
+ */
+static int toi_compress_crypto_prepare(void)
+{
+	int cpu;
+
+	if (!*toi_compressor_name) {
+		printk(KERN_INFO "TuxOnIce: Compression enabled but no "
+				"compressor name set.\n");
+		return 1;
+	}
+
+	for_each_online_cpu(cpu) {
+		struct cpu_context *this = &per_cpu(contexts, cpu);
+
+		this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0);
+		if (IS_ERR(this->transform)) {
+			printk(KERN_INFO "TuxOnIce: Failed to initialise the "
+					"%s compression transform.\n",
+					toi_compressor_name);
+			/* Keep the "no transform" state as NULL, not ERR_PTR. */
+			this->transform = NULL;
+			return 1;
+		}
+
+		/* page_buffer is a u8 *, so cast to that type. */
+		this->page_buffer =
+			(u8 *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP);
+
+		if (!this->page_buffer) {
+			printk(KERN_ERR
+			  "Failed to allocate a page buffer for TuxOnIce "
+			  "compression driver.\n");
+			return -ENOMEM;
+		}
+
+		/* Two pages: the compressor output may exceed its input. */
+		this->output_buffer = vmalloc_32(2 * PAGE_SIZE);
+
+		if (!this->output_buffer) {
+			printk(KERN_ERR
+			  "Failed to allocate a output buffer for TuxOnIce "
+			  "compression driver.\n");
+			return -ENOMEM;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * toi_compress_init
+ * @toi_or_resume: When zero, nothing is done and 0 is returned.
+ *
+ * Reset the byte counters, store the result of toi_get_next_filter() in
+ * next_driver and try to set up the per-CPU transforms and buffers.
+ *
+ * Returns: -ECHILD if there is no next filter, otherwise 0. A failure of
+ * toi_compress_crypto_prepare() is not returned from here; it is kept in
+ * toi_compress_prepare_result and acted on by toi_compress_rw_init().
+ */
+
+static int toi_compress_init(int toi_or_resume)
+{
+ if (!toi_or_resume)
+ return 0;
+
+ toi_compress_bytes_in = 0;
+ toi_compress_bytes_out = 0;
+
+ next_driver = toi_get_next_filter(&toi_compression_ops);
+
+ if (!next_driver)
+ return -ECHILD;
+
+ toi_compress_prepare_result = toi_compress_crypto_prepare();
+
+ return 0;
+}
+
+/*
+ * toi_compress_rw_init()
+ * @rw: Direction of the coming I/O; compared against READ.
+ * @stream_number: Not used by this module.
+ *
+ * React to an earlier failure to prepare the compressor. When reading,
+ * the image cannot be used, so fail with -ENODEV. Otherwise disable this
+ * module and carry on uncompressed.
+ *
+ * Returns: 0, or -ENODEV as described above.
+ */
+
+static int toi_compress_rw_init(int rw, int stream_number)
+{
+	if (!toi_compress_prepare_result)
+		return 0;
+
+	printk(KERN_ERR "Failed to initialise compression "
+			"algorithm.\n");
+
+	if (rw != READ) {
+		printk(KERN_INFO "Continuing without "
+				"compressing the image.\n");
+		toi_compression_ops.enabled = 0;
+		return 0;
+	}
+
+	printk(KERN_INFO "Unable to read the image.\n");
+	return -ENODEV;
+}
+
+/*
+ * toi_compress_write_page()
+ *
+ * Compress a page of data and pass the result on to the next module in
+ * the pipeline. If this CPU has no transform, if the compressor reports
+ * an error, or if the output is not smaller than the input, the original
+ * page is passed on unchanged instead.
+ *
+ * @index: Passed through to the next module's write_page().
+ * @buffer_page: Pointer to a buffer of size PAGE_SIZE, containing
+ *	data to be compressed.
+ * @buf_size: Number of bytes of input in @buffer_page.
+ *
+ * Returns: Whatever the next module's write_page() returns. A compressor
+ * error is not reported to the caller.
+ */
+static int toi_compress_write_page(unsigned long index,
+		struct page *buffer_page, unsigned int buf_size)
+{
+	int ret, cpu = smp_processor_id();
+	struct cpu_context *ctx = &per_cpu(contexts, cpu);
+	int compressed;
+
+	if (!ctx->transform)
+		return next_driver->write_page(index, buffer_page, buf_size);
+
+	ctx->buffer_start = kmap(buffer_page);
+
+	ctx->len = buf_size;
+
+	ret = crypto_comp_compress(ctx->transform,
+			ctx->buffer_start, buf_size,
+			ctx->output_buffer, &ctx->len);
+
+	kunmap(buffer_page);
+
+	compressed = !ret && ctx->len < buf_size;
+
+	/*
+	 * Account for what is actually handed downstream: the compressed
+	 * length only when the compressed copy is used, else the full input.
+	 */
+	mutex_lock(&stats_lock);
+	toi_compress_bytes_in += buf_size;
+	toi_compress_bytes_out += compressed ? ctx->len : buf_size;
+	mutex_unlock(&stats_lock);
+
+	if (!compressed)
+		return next_driver->write_page(index, buffer_page, buf_size);
+
+	/* Copy out of the vmalloc area so virt_to_page() can be used. */
+	memcpy(ctx->page_buffer, ctx->output_buffer, ctx->len);
+	return next_driver->write_page(index,
+			virt_to_page(ctx->page_buffer),
+			ctx->len);
+}
+
+/*
+ * toi_compress_read_page()
+ * @index: Passed through to the next module's read_page().
+ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE.
+ * @buf_size: Set to PAGE_SIZE; overwritten with the decompressed length
+ *	if decompression yields something other than PAGE_SIZE.
+ *
+ * Retrieve a page from later modules and, if it was stored compressed
+ * (its length is not PAGE_SIZE), decompress it in place. When this CPU
+ * has no transform the read is passed straight through.
+ *
+ * Returns: Zero if successful. Error condition from me or from
+ * downstream on failure (-EIO if the decompressed size is wrong).
+ */
+static int toi_compress_read_page(unsigned long *index,
+		struct page *buffer_page, unsigned int *buf_size)
+{
+	int ret, cpu = smp_processor_id();
+	unsigned int len;
+	unsigned int outlen = PAGE_SIZE;
+	char *buffer_start;
+	struct cpu_context *ctx = &per_cpu(contexts, cpu);
+
+	if (!ctx->transform)
+		return next_driver->read_page(index, buffer_page, buf_size);
+
+	/*
+	 * All our reads must be synchronous - we can't decompress
+	 * data that hasn't been read yet.
+	 */
+
+	*buf_size = PAGE_SIZE;
+
+	ret = next_driver->read_page(index, buffer_page, &len);
+
+	/* Error or uncompressed data */
+	if (ret || len == PAGE_SIZE)
+		return ret;
+
+	/*
+	 * Decompress from a copy so the output can land in buffer_page.
+	 * NOTE(review): assumes the next module never reports len greater
+	 * than PAGE_SIZE, the size of page_buffer - confirm.
+	 */
+	buffer_start = kmap(buffer_page);
+	memcpy(ctx->page_buffer, buffer_start, len);
+	ret = crypto_comp_decompress(
+			ctx->transform,
+			ctx->page_buffer,
+			len, buffer_start, &outlen);
+	if (ret)
+		abort_hibernate(TOI_FAILED_IO,
+			"Compress_read returned %d.\n", ret);
+	else if (outlen != PAGE_SIZE) {
+		/* outlen is unsigned int, PAGE_SIZE unsigned long. */
+		abort_hibernate(TOI_FAILED_IO,
+			"Decompression yielded %u bytes instead of %lu.\n",
+			outlen, PAGE_SIZE);
+		printk(KERN_ERR "Decompression yielded %u bytes instead of "
+				"%lu.\n", outlen, PAGE_SIZE);
+		ret = -EIO;
+		*buf_size = outlen;
+	}
+	kunmap(buffer_page);
+	return ret;
+}
+
+/*
+ * toi_compress_print_debug_stats
+ * @buffer: Pointer to a buffer into which the debug info will be printed.
+ * @size: Size of the buffer.
+ *
+ * Print the compressor name and, once at least a page of input has been
+ * counted, the byte totals and the percentage saved. The percentage is
+ * computed in whole pages and is negative if more was output than input.
+ *
+ * Returns: Number of characters written to the buffer.
+ */
+
+static int toi_compress_print_debug_stats(char *buffer, int size)
+{
+	unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT,
+		      pages_out = toi_compress_bytes_out >> PAGE_SHIFT;
+	long percent;
+	int len;
+
+	/* Output the compression ratio achieved. */
+	if (*toi_compressor_name)
+		len = scnprintf(buffer, size, "- Compressor is '%s'.\n",
+				toi_compressor_name);
+	else
+		len = scnprintf(buffer, size, "- Compressor is not set.\n");
+
+	if (pages_in) {
+		/*
+		 * Use signed arithmetic: an unsigned difference would wrap
+		 * to a huge value whenever pages_out exceeds pages_in.
+		 */
+		percent = ((long) pages_in - (long) pages_out) * 100 /
+				(long) pages_in;
+		len += scnprintf(buffer+len, size - len, "  Compressed "
+			"%lu bytes into %lu (%ld percent compression).\n",
+			toi_compress_bytes_in,
+			toi_compress_bytes_out,
+			percent);
+	}
+	return len;
+}
+
+/*
+ * toi_compress_memory_needed
+ *
+ * Tell the caller how much memory we need to operate during hibernate/resume.
+ * Returns: int. Number of bytes reported as required; always 2 * PAGE_SIZE.
+ *
+ * NOTE(review): toi_compress_crypto_prepare() allocates one page plus a
+ * 2 * PAGE_SIZE buffer for each online CPU - confirm that this figure is
+ * meant to be independent of the number of CPUs.
+ */
+static int toi_compress_memory_needed(void)
+{
+ return 2 * PAGE_SIZE;
+}
+
+/*
+ * toi_compress_storage_needed
+ *
+ * Returns: Number of bytes toi_compress_save_config_info() writes: four
+ * unsigned longs followed by the compressor name and its terminating NUL.
+ */
+static int toi_compress_storage_needed(void)
+{
+ return 4 * sizeof(unsigned long) + strlen(toi_compressor_name) + 1;
+}
+
+/*
+ * toi_compress_save_config_info
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save information needed when reloading the image at resume time. The
+ * layout is four unsigned longs (bytes in, bytes out, expected
+ * compression, name length including the NUL) followed by the name.
+ *
+ * Returns: Number of bytes used for saving our data.
+ */
+static int toi_compress_save_config_info(char *buffer)
+{
+	unsigned long *header = (unsigned long *) buffer;
+	char *name_dest = buffer + 4 * sizeof(unsigned long);
+	int namelen = strlen(toi_compressor_name) + 1;
+
+	header[0] = toi_compress_bytes_in;
+	header[1] = toi_compress_bytes_out;
+	header[2] = toi_expected_compression;
+	header[3] = namelen;
+
+	/* namelen counts the terminator, so it is copied as well. */
+	strncpy(name_dest, toi_compressor_name, namelen);
+
+	return 4 * sizeof(unsigned long) + namelen;
+}
+
+/* toi_compress_load_config_info
+ * @buffer: Pointer to the start of the data.
+ * @size: Number of bytes that were saved.
+ *
+ * Description:	Reload information needed for decompressing the image at
+ * 		resume time. The layout read is the one written by
+ * 		toi_compress_save_config_info(). If the saved compressor
+ * 		name differs from the current one, the current transforms
+ * 		are freed, the saved name is adopted and the transforms are
+ * 		prepared again. The outcome of that is stored in
+ * 		toi_compress_prepare_result so toi_compress_rw_init() can
+ * 		act on it.
+ */
+static void toi_compress_load_config_info(char *buffer, int size)
+{
+	unsigned long *header = (unsigned long *) buffer;
+	char *saved_name = buffer + 4 * sizeof(unsigned long);
+	unsigned long namelen;
+
+	toi_compress_bytes_in = header[0];
+	toi_compress_bytes_out = header[1];
+	toi_expected_compression = header[2];
+	namelen = header[3];
+
+	/*
+	 * The length comes from the image: never let it take us past the
+	 * end of toi_compressor_name.
+	 */
+	if (namelen > sizeof(toi_compressor_name))
+		namelen = sizeof(toi_compressor_name);
+
+	if (!strncmp(toi_compressor_name, saved_name, namelen))
+		return;
+
+	toi_compress_cleanup(1);
+	strncpy(toi_compressor_name, saved_name, namelen);
+	/* namelen is non-zero here; it includes the NUL in a valid image. */
+	toi_compressor_name[namelen - 1] = '\0';
+	toi_compress_prepare_result = toi_compress_crypto_prepare();
+}
+
+/*
+ * toi_compress_expected_ratio
+ *
+ * Description:	Returns the expected ratio between data passed into this module
+ * 		and the amount of data output when writing.
+ * Returns:	100 if the module is disabled. Otherwise 100 minus the value
+ * 		held in toi_expected_compression (set via our sysfs entry).
+ */
+
+static int toi_compress_expected_ratio(void)
+{
+	return toi_compression_ops.enabled ?
+		100 - toi_expected_compression : 100;
+}
+
+/*
+ * data for our sysfs entries:
+ *   expected_compression - integer backed by toi_expected_compression
+ *   enabled              - integer backed by toi_compression_ops.enabled
+ *   algorithm            - string backed by toi_compressor_name
+ *
+ * NOTE(review): the numeric arguments (0, 99 / 0, 1 / 31) are presumably
+ * the permitted range and maximum string length - confirm against the
+ * SYSFS_INT and SYSFS_STRING definitions.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression,
+ 0, 99, 0, NULL),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0,
+ NULL),
+ SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL),
+};
+
+/*
+ * Ops structure.
+ *
+ * Describes this module as a FILTER_MODULE named "compression" and wires
+ * up the callbacks defined in this file. It is forward-declared near the
+ * top of the file because several of those callbacks refer to it.
+ */
+static struct toi_module_ops toi_compression_ops = {
+ .type = FILTER_MODULE,
+ .name = "compression",
+ .directory = "compression",
+ .module = THIS_MODULE,
+ .initialise = toi_compress_init,
+ .cleanup = toi_compress_cleanup,
+ .memory_needed = toi_compress_memory_needed,
+ .print_debug_info = toi_compress_print_debug_stats,
+ .save_config_info = toi_compress_save_config_info,
+ .load_config_info = toi_compress_load_config_info,
+ .storage_needed = toi_compress_storage_needed,
+ .expected_compression = toi_compress_expected_ratio,
+
+ .rw_init = toi_compress_rw_init,
+
+ .write_page = toi_compress_write_page,
+ .read_page = toi_compress_read_page,
+
+ .sysfs_data = sysfs_params,
+ /* Element count of sysfs_params[]. */
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+/*
+ * toi_compress_load
+ *
+ * Register the compression module; run as a late initcall.
+ * Returns: The result of toi_register_module().
+ */
+static __init int toi_compress_load(void)
+{
+ return toi_register_module(&toi_compression_ops);
+}
+
+late_initcall(toi_compress_load);
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
include/linux/netlink.h | 1 +
kernel/power/Makefile | 2 +-
kernel/power/tuxonice_storage.c | 282 +++++++++++++++++++++++++++++++++++++++
kernel/power/tuxonice_storage.h | 10 ++
4 files changed, 294 insertions(+), 1 deletions(-)
create mode 100644 kernel/power/tuxonice_storage.c
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index 5ba398e..d597e15 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -24,6 +24,7 @@
/* leave room for NETLINK_DM (DM Events) */
#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */
#define NETLINK_ECRYPTFS 19
+#define NETLINK_TOI_USM 21 /* Userspace storage manager */
#define MAX_LINKS 32
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 180b89a..9ad139e 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -11,7 +11,7 @@ tuxonice_core-objs := tuxonice_modules.o tuxonice_sysfs.o tuxonice_highlevel.o \
obj-$(CONFIG_TOI) += tuxonice_builtin.o
ifdef CONFIG_NET
-tuxonice_core-objs += tuxonice_netlink.o
+tuxonice_core-objs += tuxonice_storage.o tuxonice_netlink.o
endif
obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c
new file mode 100644
index 0000000..5dafc95
--- /dev/null
+++ b/kernel/power/tuxonice_storage.c
@@ -0,0 +1,282 @@
+/*
+ * kernel/power/tuxonice_storage.c
+ *
+ * Copyright (C) 2005-2008 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for talking to a userspace program that manages storage.
+ *
+ * The kernel side:
+ * - starts the userspace program;
+ * - sends messages telling it when to open and close the connection;
+ * - tells it when to quit;
+ *
+ * The user space side:
+ * - passes messages regarding status;
+ *
+ */
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_ui.h"
+
+static struct user_helper_data usm_helper_data;
+static struct toi_module_ops usm_ops;
+static int message_received, usm_prepare_count;
+static int storage_manager_last_action, storage_manager_action;
+
+/*
+ * usm_user_rcv_msg
+ * @skb: The received buffer; used for the privilege check.
+ * @nlh: Header of the netlink message being handled.
+ *
+ * Handle a message from the userspace storage manager. USM_MSG_SUCCESS
+ * and USM_MSG_FAILED are recorded in message_received and wake whoever
+ * is waiting on usm_helper_data.wait_for_process. Any other message that
+ * passes the checks is only logged.
+ *
+ * Returns: 0 for control messages (type below NETLINK_MSG_BASE), -EINVAL
+ * for types at or above USM_MSG_MAX, -EPERM if the sender lacks
+ * CAP_NET_ADMIN, -EBUSY for a second NETLINK_MSG_NOFREEZE_ME request,
+ * otherwise 1.
+ */
+static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int type;
+
+	type = nlh->nlmsg_type;
+
+	/* A control message: ignore them */
+	if (type < NETLINK_MSG_BASE)
+		return 0;
+
+	/* Unknown message: reply with EINVAL */
+	if (type >= USM_MSG_MAX)
+		return -EINVAL;
+
+	/* All operations require privileges, even GET */
+	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Only allow one task to receive NOFREEZE privileges */
+	if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1)
+		return -EBUSY;
+
+	/* None of the messages handled here carries a payload we read. */
+	switch (type) {
+	case USM_MSG_SUCCESS:
+	case USM_MSG_FAILED:
+		message_received = type;
+		complete(&usm_helper_data.wait_for_process);
+		break;
+	default:
+		printk(KERN_INFO "Storage manager doesn't recognise "
+				"message %d.\n", type);
+	}
+
+	return 1;
+}
+
+#ifdef CONFIG_NET
+static int activations;
+
+/*
+ * toi_activate_storage
+ * @force: When nonzero, send the connect request even if storage has
+ *	already been activated by an earlier, still outstanding call.
+ *
+ * Ask the userspace storage manager to connect (USM_MSG_CONNECT) and wait
+ * up to two seconds for its reply. Does nothing if no helper is running
+ * or the module is disabled. Calls are counted in 'activations'.
+ *
+ * Returns: Always 0; the helper's reply is left in message_received.
+ */
+int toi_activate_storage(int force)
+{
+ int tries = 1;
+
+ if (usm_helper_data.pid == -1 || !usm_ops.enabled)
+ return 0;
+
+ message_received = 0;
+ activations++;
+
+ if (activations > 1 && !force)
+ return 0;
+
+ /*
+  * NOTE(review): tries starts at 1 and the loop stops at 2, so the body
+  * runs at most once - confirm a single attempt is intended.
+  */
+ while ((!message_received || message_received == USM_MSG_FAILED) &&
+ tries < 2) {
+ toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt "
+ "%d.\n", tries);
+
+ init_completion(&usm_helper_data.wait_for_process);
+
+ toi_send_netlink_message(&usm_helper_data,
+ USM_MSG_CONNECT,
+ NULL, 0);
+
+ /* Wait 2 seconds for the userspace process to make contact */
+ wait_for_completion_timeout(&usm_helper_data.wait_for_process,
+ 2*HZ);
+
+ tries++;
+ }
+
+ return 0;
+}
+
+/*
+ * toi_deactivate_storage
+ * @force: When nonzero, send the disconnect request even if other
+ *	activations are still outstanding.
+ *
+ * Drop one activation and, when none remain (or @force is set), ask the
+ * userspace storage manager to disconnect (USM_MSG_DISCONNECT), waiting
+ * up to two seconds for its reply. Does nothing if no helper is running
+ * or the module is disabled.
+ *
+ * Returns: 1 if no reply arrived or the helper reported failure,
+ * otherwise 0.
+ */
+int toi_deactivate_storage(int force)
+{
+ if (usm_helper_data.pid == -1 || !usm_ops.enabled)
+ return 0;
+
+ message_received = 0;
+ activations--;
+
+ if (activations && !force)
+ return 0;
+
+ init_completion(&usm_helper_data.wait_for_process);
+
+ toi_send_netlink_message(&usm_helper_data,
+ USM_MSG_DISCONNECT,
+ NULL, 0);
+
+ wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ);
+
+ if (!message_received || message_received == USM_MSG_FAILED) {
+ printk(KERN_INFO "Returning failure disconnecting storage.\n");
+ return 1;
+ }
+
+ return 0;
+}
+#endif
+
+/*
+ * storage_manager_simulate
+ *
+ * Debugging aid hooked up to a sysfs entry: runs one full
+ * prepare / activate / deactivate / cleanup cycle against the storage
+ * manager, logging each step and calling schedule() in between.
+ */
+static void storage_manager_simulate(void)
+{
+ printk(KERN_INFO "--- Storage manager simulate ---\n");
+ toi_prepare_usm();
+ schedule();
+ printk(KERN_INFO "--- Activate storage 1 ---\n");
+ toi_activate_storage(1);
+ schedule();
+ printk(KERN_INFO "--- Deactivate storage 1 ---\n");
+ toi_deactivate_storage(1);
+ schedule();
+ printk(KERN_INFO "--- Cleanup usm ---\n");
+ toi_cleanup_usm();
+ schedule();
+ printk(KERN_INFO "--- Storage manager simulate ends ---\n");
+}
+
+/*
+ * usm_storage_needed
+ *
+ * Returns: Bytes needed in the image header for our configuration: the
+ * length of the helper program path, without a terminating NUL.
+ */
+static int usm_storage_needed(void)
+{
+ return strlen(usm_helper_data.program);
+}
+
+/*
+ * usm_save_config_info
+ * @buf: Destination for the saved configuration.
+ *
+ * Copy the helper program path into @buf. The terminating NUL is not
+ * copied.
+ *
+ * Returns: Number of bytes written.
+ */
+static int usm_save_config_info(char *buf)
+{
+ int len = strlen(usm_helper_data.program);
+ memcpy(buf, usm_helper_data.program, len);
+ return len;
+}
+
+/*
+ * usm_load_config_info
+ * @buf: Saved configuration, as written by usm_save_config_info().
+ * @size: Number of bytes in @buf.
+ *
+ * Restore the helper program path from the image unless a path has
+ * already been set.
+ *
+ * NOTE(review): @size bytes are copied with no check against the size of
+ * usm_helper_data.program, and no NUL is appended (none was saved). This
+ * relies on the saved path fitting and on the rest of the destination
+ * already being zero - confirm.
+ */
+static void usm_load_config_info(char *buf, int size)
+{
+ /* Don't load the saved path if one has already been set */
+ if (usm_helper_data.program[0])
+ return;
+
+ memcpy(usm_helper_data.program, buf, size);
+}
+
+/*
+ * usm_memory_needed
+ *
+ * Returns: Estimated number of bytes of memory this module needs; a fixed
+ * 32 pages.
+ */
+static int usm_memory_needed(void)
+{
+ /* ball park figure of 32 pages */
+ return 32 * PAGE_SIZE;
+}
+
+/* toi_prepare_usm
+ *
+ * Start the userspace storage manager and ask it to activate storage.
+ * Calls are counted in usm_prepare_count; only the first of a series of
+ * nested calls does any work, and nothing is started if the module is
+ * disabled or no helper program is configured.
+ *
+ * Returns: Nonzero if this call did the work and the helper has a pid
+ * afterwards; 0 in every other case.
+ */
+int toi_prepare_usm(void)
+{
+ usm_prepare_count++;
+
+ if (usm_prepare_count > 1 || !usm_ops.enabled)
+ return 0;
+
+ usm_helper_data.pid = -1;
+
+ if (!*usm_helper_data.program)
+ return 0;
+
+ toi_netlink_setup(&usm_helper_data);
+
+ if (usm_helper_data.pid == -1)
+ printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't"
+ " start it.\n");
+
+ /* Returns at once if the helper did not start (pid is still -1). */
+ toi_activate_storage(0);
+
+ return usm_helper_data.pid != -1;
+}
+
+/*
+ * toi_cleanup_usm
+ *
+ * Undo one toi_prepare_usm() call. When the last one has been undone and
+ * a helper is running, deactivate storage and close the netlink
+ * connection to the helper.
+ */
+void toi_cleanup_usm(void)
+{
+ usm_prepare_count--;
+
+ if (usm_helper_data.pid > -1 && !usm_prepare_count) {
+ toi_deactivate_storage(0);
+ toi_netlink_close(&usm_helper_data);
+ }
+}
+
+/*
+ * storage_manager_activate
+ *
+ * Callback for the "activate_storage" sysfs entry. When the value written
+ * differs from the one acted on last time, prepare the storage manager
+ * (nonzero) or clean it up (zero), then remember the new value.
+ */
+static void storage_manager_activate(void)
+{
+	if (storage_manager_action != storage_manager_last_action) {
+		if (!storage_manager_action)
+			toi_cleanup_usm();
+		else
+			toi_prepare_usm();
+
+		storage_manager_last_action = storage_manager_action;
+	}
+}
+
+/*
+ * Storage manager specific /sys/power/tuxonice entries:
+ *   simulate_atomic_copy - runs storage_manager_simulate()
+ *   enabled              - integer backed by usm_ops.enabled
+ *   program              - string backed by usm_helper_data.program
+ *   activate_storage     - integer backed by storage_manager_action;
+ *                          storage_manager_activate() is its callback
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate),
+ SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL),
+ SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0,
+ NULL),
+ SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1,
+ 0, storage_manager_activate)
+};
+
+/*
+ * Ops structure: describes the storage manager as a MISC_MODULE named
+ * "usm" whose sysfs entries live in the "storage_manager" directory.
+ */
+static struct toi_module_ops usm_ops = {
+ .type = MISC_MODULE,
+ .name = "usm",
+ .directory = "storage_manager",
+ .module = THIS_MODULE,
+ .storage_needed = usm_storage_needed,
+ .save_config_info = usm_save_config_info,
+ .load_config_info = usm_load_config_info,
+ .memory_needed = usm_memory_needed,
+
+ .sysfs_data = sysfs_params,
+ /* Element count of sysfs_params[]. */
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* toi_usm_sysfs_init
+ * Description: Boot time initialisation for user interface.
+ */
+int toi_usm_init(void)
+{
+ usm_helper_data.nl = NULL;
+ usm_helper_data.program[0] = '\0';
+ usm_helper_data.pid = -1;
+ usm_helper_data.skb_size = 0;
+ usm_helper_data.pool_limit = 6;
+ usm_helper_data.netlink_id = NETLINK_TOI_USM;
+ usm_helper_data.name = "userspace storage manager";
+ usm_helper_data.rcv_msg = usm_user_rcv_msg;
+ usm_helper_data.interface_version = 2;
+ usm_helper_data.must_init = 0;
+ init_completion(&usm_helper_data.wait_for_process);
+
+ return toi_register_module(&usm_ops);
+}
+
+/*
+ * toi_usm_exit
+ *
+ * Counterpart of toi_usm_init(): calls toi_netlink_close_complete() on
+ * the helper data, then unregisters the module.
+ */
+void toi_usm_exit(void)
+{
+ toi_netlink_close_complete(&usm_helper_data);
+ toi_unregister_module(&usm_ops);
+}
diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h
index af48608..24f8e8a 100644
--- a/kernel/power/tuxonice_storage.h
+++ b/kernel/power/tuxonice_storage.h
@@ -6,6 +6,15 @@
* This file is released under the GPLv2.
*/
+#ifdef CONFIG_NET
+int toi_prepare_usm(void);
+void toi_cleanup_usm(void);
+
+int toi_activate_storage(int force);
+int toi_deactivate_storage(int force);
+extern int toi_usm_init(void);
+extern void toi_usm_exit(void);
+#else
static inline int toi_usm_init(void) { return 0; }
static inline void toi_usm_exit(void) { }
@@ -21,6 +30,7 @@ static inline int toi_deactivate_storage(int force)
static inline int toi_prepare_usm(void) { return 0; }
static inline void toi_cleanup_usm(void) { }
+#endif
enum {
USM_MSG_BASE = 0x10,
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
include/linux/suspend.h | 14 ++++++++++++++
init/do_mounts.c | 2 ++
init/do_mounts_initrd.c | 6 ++++++
kernel/power/snapshot.c | 1 +
kernel/power/tuxonice_highlevel.c | 3 +++
5 files changed, 26 insertions(+), 0 deletions(-)
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index 8faa15c..1f52617 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -359,5 +359,19 @@ extern void try_tuxonice_resume(void);
#else
#define try_tuxonice_resume() do { } while (0)
#endif
+
+extern int resume_attempted;
+extern int software_resume(void);
+
+/* Call software_resume() unless a resume has already been attempted. */
+static inline void check_resume_attempted(void)
+{
+	if (!resume_attempted)
+		software_resume();
+}
+#else
+#define check_resume_attempted() do { } while (0)
+#define resume_attempted (0)
#endif
#endif /* _LINUX_SUSPEND_H */
diff --git a/init/do_mounts.c b/init/do_mounts.c
index dd7ee5f..605adc5 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -412,6 +412,8 @@ void __init prepare_namespace(void)
if (is_floppy && rd_doload && rd_load_disk(0))
ROOT_DEV = Root_RAM0;
+ check_resume_attempted();
+
mount_root();
out:
sys_mount(".", "/", NULL, MS_MOVE, NULL);
diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c
index 614241b..f3ea292 100644
--- a/init/do_mounts_initrd.c
+++ b/init/do_mounts_initrd.c
@@ -6,6 +6,7 @@
#include <linux/romfs_fs.h>
#include <linux/initrd.h>
#include <linux/sched.h>
+#include <linux/suspend.h>
#include <linux/freezer.h>
#include "do_mounts.h"
@@ -68,6 +69,11 @@ static void __init handle_initrd(void)
current->flags &= ~PF_FREEZER_SKIP;
+ if (!resume_attempted)
+ printk(KERN_ERR "TuxOnIce: No attempt was made to resume from "
+ "any image that might exist.\n");
+ clear_toi_state(TOI_BOOT_TIME);
+
/* move initrd to rootfs' /old */
sys_fchdir(old_fd);
sys_mount("/", ".", NULL, MS_MOVE, NULL);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index df70aff..bae4278 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -47,6 +47,7 @@ static void swsusp_unset_page_forbidden(struct page *);
* directly to their "original" page frames.
*/
struct pbe *restore_pblist;
+int resume_attempted;
/* Pointer to an auxiliary buffer (1 page) */
static void *buffer;
diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
index f2ba0bb..06f97e7 100644
--- a/kernel/power/tuxonice_highlevel.c
+++ b/kernel/power/tuxonice_highlevel.c
@@ -970,6 +970,7 @@ out:
void toi_try_resume(void)
{
set_toi_state(TOI_TRYING_TO_RESUME);
+ resume_attempted = 1;
current->flags |= PF_MEMALLOC;
@@ -994,6 +995,8 @@ void toi_try_resume(void)
**/
static void toi_sys_power_disk_try_resume(void)
{
+ resume_attempted = 1;
+
/*
* There's a comment in kernel/power/disk.c that indicates
* we should be able to use mutex_lock_nested below. That
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
drivers/gpu/drm/drm_gem.c | 3 ++-
include/linux/fs.h | 2 ++
include/linux/mm.h | 1 +
mm/shmem.c | 2 ++
4 files changed, 7 insertions(+), 1 deletions(-)
diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c
index 4984aa8..c69b548 100644
--- a/drivers/gpu/drm/drm_gem.c
+++ b/drivers/gpu/drm/drm_gem.c
@@ -136,7 +136,8 @@ drm_gem_object_alloc(struct drm_device *dev, size_t size)
obj = kcalloc(1, sizeof(*obj), GFP_KERNEL);
obj->dev = dev;
- obj->filp = shmem_file_setup("drm mm object", size, VM_NORESERVE);
+ obj->filp = shmem_file_setup("drm mm object", size,
+ VM_NORESERVE | VM_ATOMIC_COPY);
if (IS_ERR(obj->filp)) {
kfree(obj);
return NULL;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 5bed436..344b1ca 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -231,6 +231,8 @@ struct inodes_stat_t {
#define S_NOCMTIME 128 /* Do not update file c/mtime */
#define S_SWAPFILE 256 /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE 512 /* Inode is fs-internal */
+#define S_ATOMIC_COPY 1024 /* Pages mapped with this inode need to be
+ atomically copied (gem) */
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
diff --git a/include/linux/mm.h b/include/linux/mm.h
index bff1f0d..3daeccb 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -105,6 +105,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */
#define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */
#define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */
+#define VM_ATOMIC_COPY 0x80000000 /* TuxOnIce should atomically copy */
#ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */
#define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS
diff --git a/mm/shmem.c b/mm/shmem.c
index b25f95c..4908d20 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1557,6 +1557,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode,
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
info->flags = flags & VM_NORESERVE;
+ if (flags & VM_ATOMIC_COPY)
+ inode->i_flags |= S_ATOMIC_COPY;
INIT_LIST_HEAD(&info->swaplist);
switch (mode & S_IFMT) {
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/Makefile | 4 +
kernel/power/tuxonice_netlink.c | 339 +++++++++++++++++++++++++++++++++++++++
2 files changed, 343 insertions(+), 0 deletions(-)
create mode 100644 kernel/power/tuxonice_netlink.c
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 07efc8a..180b89a 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -10,6 +10,10 @@ tuxonice_core-objs := tuxonice_modules.o tuxonice_sysfs.o tuxonice_highlevel.o \
obj-$(CONFIG_TOI) += tuxonice_builtin.o
+ifdef CONFIG_NET
+tuxonice_core-objs += tuxonice_netlink.o
+endif
+
obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
obj-$(CONFIG_PM) += main.o
diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c
new file mode 100644
index 0000000..c394f79
--- /dev/null
+++ b/kernel/power/tuxonice_netlink.c
@@ -0,0 +1,339 @@
+/*
+ * kernel/power/tuxonice_netlink.c
+ *
+ * Copyright (C) 2004-2008 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Functions for communicating with a userspace helper via netlink.
+ */
+
+
+#include <linux/suspend.h>
+#include "tuxonice_netlink.h"
+#include "tuxonice.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_alloc.h"
+
+static struct user_helper_data *uhd_list;
+
+/*
+ * Top up the emergency SKB pool until it holds pool_limit buffers, giving up
+ * as soon as an allocation fails. toi_get_skb() falls back on this pool when
+ * a fresh allocation is not possible (eg, when eating memory).
+ */
+static void toi_fill_skb_pool(struct user_helper_data *uhd)
+{
+	struct sk_buff *fresh;
+
+	for (; uhd->pool_level < uhd->pool_limit; uhd->pool_level++) {
+		fresh = alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
+		if (!fresh)
+			return;
+
+		/* Push onto the head of the singly linked pool. */
+		fresh->next = uhd->emerg_skbs;
+		uhd->emerg_skbs = fresh;
+	}
+}
+
+/*
+ * Get an skb big enough for one message of uhd->skb_size bytes. A fresh
+ * allocation is preferred; if that fails, the head of the emergency pool is
+ * detached and returned instead. Returns NULL when both sources are empty.
+ */
+static struct sk_buff *toi_get_skb(struct user_helper_data *uhd)
+{
+	struct sk_buff *buf;
+
+	buf = alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP);
+	if (buf)
+		return buf;
+
+	buf = uhd->emerg_skbs;
+	if (!buf)
+		return NULL;
+
+	/* Unlink the pool head and account for its removal. */
+	uhd->emerg_skbs = buf->next;
+	buf->next = NULL;
+	uhd->pool_level--;
+
+	return buf;
+}
+
+/*
+ * Dispose of an skb that was not handed to netlink: park it at the head of
+ * the emergency pool while the pool has room, otherwise free it.
+ */
+static void put_skb(struct user_helper_data *uhd, struct sk_buff *skb)
+{
+	if (uhd->pool_level < uhd->pool_limit) {
+		skb->next = uhd->emerg_skbs;
+		uhd->emerg_skbs = skb;
+		/*
+		 * Keep the count in step with the list, as the fill and get
+		 * paths do; otherwise the pool can grow past pool_limit.
+		 */
+		uhd->pool_level++;
+	} else
+		kfree_skb(skb);
+}
+
+/*
+ * Send one netlink message of the given type to the userspace helper.
+ *
+ * @uhd:    helper to talk to; nothing is sent while uhd->pid is -1.
+ * @type:   netlink message type placed in the message header.
+ * @params: payload copied into the message; may be NULL.
+ * @len:    payload length in bytes.
+ *
+ * After the message has been passed to netlink_unicast(), the helper task is
+ * looked up by pid and woken, and the caller yields. Nothing is returned to
+ * the caller; problems are only reported through printk.
+ */
+void toi_send_netlink_message(struct user_helper_data *uhd,
+ int type, void *params, size_t len)
+{
+ struct sk_buff *skb;
+ struct nlmsghdr *nlh;
+ void *dest;
+ struct task_struct *t;
+
+ /* No helper has registered (or it has been closed): nothing to do. */
+ if (uhd->pid == -1)
+ return;
+
+ if (uhd->debug)
+ printk(KERN_ERR "toi_send_netlink_message: Send "
+ "message type %d.\n", type);
+
+ /* Fresh allocation if possible, else one from the emergency pool. */
+ skb = toi_get_skb(uhd);
+ if (!skb) {
+ printk(KERN_INFO "toi_netlink: Can't allocate skb!\n");
+ return;
+ }
+
+ /* NLMSG_PUT contains a hidden goto nlmsg_failure */
+ nlh = NLMSG_PUT(skb, 0, uhd->sock_seq, type, len);
+ uhd->sock_seq++;
+
+ dest = NLMSG_DATA(nlh);
+ if (params && len > 0)
+ memcpy(dest, params, len);
+
+ /* NOTE(review): the return value of netlink_unicast() is not checked. */
+ netlink_unicast(uhd->nl, skb, uhd->pid, 0);
+
+ /* Look the helper up by pid in the init pid namespace and wake it. */
+ read_lock(&tasklist_lock);
+ t = find_task_by_pid_type_ns(PIDTYPE_PID, uhd->pid, &init_pid_ns);
+ if (!t) {
+ read_unlock(&tasklist_lock);
+ if (uhd->pid > -1)
+ printk(KERN_INFO "Hmm. Can't find the userspace task"
+ " %d.\n", uhd->pid);
+ return;
+ }
+ wake_up_process(t);
+ read_unlock(&tasklist_lock);
+
+ yield();
+
+ return;
+
+ /* Reached only through the goto hidden inside NLMSG_PUT above. */
+nlmsg_failure:
+ if (skb)
+ put_skb(uhd, skb);
+
+ if (uhd->debug)
+ printk(KERN_ERR "toi_send_netlink_message: Failed to send "
+ "message type %d.\n", type);
+}
+
+/*
+ * Send the helper a NETLINK_MSG_IS_DEBUGGING message whose one-byte payload
+ * is always 1.
+ */
+static void send_whether_debugging(struct user_helper_data *uhd)
+{
+	static u8 debugging_flag = 1;
+
+	toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING,
+			&debugging_flag, sizeof(u8));
+}
+
+/*
+ * Mark the task with the given pid PF_NOFREEZE so that it can keep running
+ * whilst we are hibernating, remember it as this helper's process and
+ * acknowledge with NETLINK_MSG_NOFREEZE_ACK.
+ *
+ * Returns 0 on success or -EINVAL if no such task can be found.
+ */
+static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid)
+{
+	struct task_struct *helper;
+
+	if (uhd->debug)
+		printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid);
+
+	/* The task is only touched while tasklist_lock is held. */
+	read_lock(&tasklist_lock);
+	helper = find_task_by_pid_type_ns(PIDTYPE_PID, pid, &init_pid_ns);
+	if (helper)
+		helper->flags |= PF_NOFREEZE;
+	read_unlock(&tasklist_lock);
+
+	if (!helper) {
+		printk(KERN_INFO "Strange. Can't find the userspace task %d.\n",
+			pid);
+		return -EINVAL;
+	}
+
+	uhd->pid = pid;
+	toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0);
+
+	return 0;
+}
+
+/*
+ * The userspace process has told us it is ready. If it speaks our interface
+ * version, complete wait_for_process and return 0. Otherwise log the
+ * mismatch, call the not_ready hook (if any) and return -EINVAL.
+ */
+static int nl_ready(struct user_helper_data *uhd, u32 version)
+{
+	if (version == uhd->interface_version) {
+		complete(&uhd->wait_for_process);
+		return 0;
+	}
+
+	printk(KERN_INFO "%s userspace process using invalid interface"
+			" version (%d - kernel wants %d). Trying to "
+			"continue without it.\n",
+			uhd->name, version, uhd->interface_version);
+	if (uhd->not_ready)
+		uhd->not_ready();
+
+	return -EINVAL;
+}
+
+/*
+ * Tear down the netlink side of a helper: release the kernel socket if one
+ * exists, free every skb in the emergency pool and mark the helper as gone
+ * (pid = -1). Does nothing harmful when the socket is already NULL and the
+ * pool is already empty.
+ */
+void toi_netlink_close_complete(struct user_helper_data *uhd)
+{
+	if (uhd->nl) {
+		netlink_kernel_release(uhd->nl);
+		uhd->nl = NULL;
+	}
+
+	while (uhd->emerg_skbs) {
+		struct sk_buff *next = uhd->emerg_skbs->next;
+		kfree_skb(uhd->emerg_skbs);
+		uhd->emerg_skbs = next;
+	}
+
+	/*
+	 * The pool is empty now. Without resetting the count, the next
+	 * toi_fill_skb_pool() would still see a full pool and never refill it.
+	 */
+	uhd->pool_level = 0;
+
+	uhd->pid = -1;
+}
+
+/*
+ * Dispatch one received message. The helper-specific rcv_msg handler goes
+ * first; only when it returns 1 (a valid message it doesn't know) are the
+ * generic message types handled here.
+ *
+ * Returns whatever rcv_msg returned if that was not 1; otherwise 0 on
+ * success, -EBUSY for a repeated nofreeze request, or -EINVAL.
+ */
+static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd,
+		struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int type = nlh->nlmsg_type;
+	int *payload = NLMSG_DATA(nlh);
+	int ret;
+
+	if (uhd->debug)
+		printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n",
+			type);
+
+	ret = uhd->rcv_msg(skb, nlh);
+	if (ret != 1)
+		return ret;
+
+	switch (type) {
+	case NETLINK_MSG_NOFREEZE_ME:
+		/* Only allow one task to receive NOFREEZE privileges */
+		if (uhd->pid != -1) {
+			printk(KERN_INFO "Received extra nofreeze me requests.\n");
+			return -EBUSY;
+		}
+		return nl_set_nofreeze(uhd, nlh->nlmsg_pid);
+	case NETLINK_MSG_GET_DEBUGGING:
+		send_whether_debugging(uhd);
+		return 0;
+	case NETLINK_MSG_READY:
+		/* The payload must be exactly one u32: the interface version. */
+		if (nlh->nlmsg_len == NLMSG_LENGTH(sizeof(u32)))
+			return nl_ready(uhd, (u32) *payload);
+
+		printk(KERN_INFO "Invalid ready mesage.\n");
+		if (uhd->not_ready)
+			uhd->not_ready();
+		return -EINVAL;
+	case NETLINK_MSG_CLEANUP:
+		toi_netlink_close_complete(uhd);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * Netlink input callback. Finds the helper whose netlink_id matches the
+ * receiving socket's protocol, then walks every message in the skb, passing
+ * each to toi_nl_gen_rcv_msg(). An ack is sent on error, or when the sender
+ * asked for one with NLM_F_ACK.
+ */
+static void toi_user_rcv_skb(struct sk_buff *skb)
+{
+	struct user_helper_data *uhd;
+	struct nlmsghdr *nlh;
+	u32 consumed;
+	int err;
+
+	for (uhd = uhd_list; uhd; uhd = uhd->next)
+		if (uhd->netlink_id == skb->sk->sk_protocol)
+			break;
+
+	if (!uhd)
+		return;
+
+	while (skb->len >= NLMSG_SPACE(0)) {
+		nlh = (struct nlmsghdr *) skb->data;
+		if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
+			return;
+
+		/* Aligned message length, clamped to what is left in the skb. */
+		consumed = NLMSG_ALIGN(nlh->nlmsg_len);
+		if (consumed > skb->len)
+			consumed = skb->len;
+
+		/* err is 0 here whenever the ack is sent only for NLM_F_ACK. */
+		err = toi_nl_gen_rcv_msg(uhd, skb, nlh);
+		if (err || (nlh->nlmsg_flags & NLM_F_ACK))
+			netlink_ack(skb, nlh, err);
+
+		skb_pull(skb, consumed);
+	}
+}
+
+/*
+ * Register the helper on uhd_list (once only), create its kernel netlink
+ * socket and pre-fill the emergency skb pool.
+ *
+ * Returns 0 on success or -ENOMEM if the socket cannot be created.
+ */
+static int netlink_prepare(struct user_helper_data *uhd)
+{
+	struct user_helper_data *this;
+
+	/*
+	 * toi_netlink_setup() calls us on every setup. Linking a helper that
+	 * is already on the list would point its ->next back into the list
+	 * and create a cycle, so only add it the first time.
+	 */
+	for (this = uhd_list; this; this = this->next)
+		if (this == uhd)
+			break;
+
+	if (!this) {
+		uhd->next = uhd_list;
+		uhd_list = uhd;
+	}
+
+	uhd->sock_seq = 0x42c0ffee;
+	uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, 0,
+			toi_user_rcv_skb, NULL, THIS_MODULE);
+	if (!uhd->nl) {
+		printk(KERN_INFO "Failed to allocate netlink socket for %s.\n",
+			uhd->name);
+		return -ENOMEM;
+	}
+
+	toi_fill_skb_pool(uhd);
+
+	return 0;
+}
+
+/*
+ * Start shutting the helper down: clear PF_NOFREEZE on the task with
+ * uhd->pid (if it can be found) and send it NETLINK_MSG_CLEANUP.
+ */
+void toi_netlink_close(struct user_helper_data *uhd)
+{
+	struct task_struct *helper;
+
+	read_lock(&tasklist_lock);
+	helper = find_task_by_pid_type_ns(PIDTYPE_PID, uhd->pid, &init_pid_ns);
+	if (helper != NULL)
+		helper->flags &= ~PF_NOFREEZE;
+	read_unlock(&tasklist_lock);
+
+	toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0);
+}
+
+/*
+ * Bring up the netlink channel for a helper: clear any leftover state,
+ * create the socket, launch the userspace program and wait up to two
+ * seconds for it to make contact.
+ *
+ * Returns 0 on success and 1 on any failure.
+ */
+int toi_netlink_setup(struct user_helper_data *uhd)
+{
+	/* In case userui didn't clean up properly on us */
+	toi_netlink_close_complete(uhd);
+
+	if (netlink_prepare(uhd) < 0) {
+		printk(KERN_INFO "Netlink prepare failed.\n");
+		return 1;
+	}
+
+	if (toi_launch_userspace_program(uhd->program, uhd->netlink_id,
+				UMH_WAIT_EXEC, uhd->debug) < 0) {
+		printk(KERN_INFO "Launch userspace program failed.\n");
+		goto teardown;
+	}
+
+	/* Wait 2 seconds for the userspace process to make contact */
+	wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ);
+
+	/* A pid other than -1 means the helper has registered itself. */
+	if (uhd->pid != -1)
+		return 0;
+
+	printk(KERN_INFO "%s: Failed to contact userspace process.\n",
+		uhd->name);
+
+teardown:
+	toi_netlink_close_complete(uhd);
+	return 1;
+}
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
fs/drop_caches.c | 2 +-
include/linux/mm.h | 1 +
2 files changed, 2 insertions(+), 1 deletions(-)
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index b6a719a..e2f2799 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -33,7 +33,7 @@ static void drop_pagecache_sb(struct super_block *sb)
iput(toput_inode);
}
-static void drop_pagecache(void)
+void drop_pagecache(void)
{
struct super_block *sb;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3daeccb..c4199cd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1298,6 +1298,7 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, struct file *,
void __user *, size_t *, loff_t *);
unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
unsigned long lru_pages);
+void drop_pagecache(void);
#ifndef CONFIG_MMU
#define randomize_va_space 0
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
include/linux/freezer.h | 16 ++++++++++++++++
include/linux/fs.h | 3 +++
kernel/power/process.c | 12 ++++++++++++
3 files changed, 31 insertions(+), 0 deletions(-)
diff --git a/include/linux/freezer.h b/include/linux/freezer.h
index 4f50655..c775cd1 100644
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -121,6 +121,21 @@ static inline void set_freezable(void)
current->flags &= ~PF_NOFREEZE;
}
+#ifdef CONFIG_PM_SLEEP
+extern int freezer_state;
+#define FREEZER_OFF 0
+#define FREEZER_FILESYSTEMS_FROZEN 1
+#define FREEZER_USERSPACE_FROZEN 2
+#define FREEZER_FULLY_ON 3
+
+static inline int freezer_is_on(void)
+{
+ return freezer_state == FREEZER_FULLY_ON;
+}
+#else
+static inline int freezer_is_on(void) { return 0; }
+#endif
+
extern void thaw_kernel_threads(void);
/*
@@ -174,6 +189,7 @@ static inline int freeze_processes(void) { BUG(); return 0; }
static inline void thaw_processes(void) {}
static inline int try_to_freeze(void) { return 0; }
+static inline int freezer_is_on(void) { return 0; }
static inline void thaw_kernel_threads(void) { }
static inline void freezer_do_not_count(void) {}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 344b1ca..a11406d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1950,6 +1950,9 @@ extern int thaw_bdev(struct block_device *bdev, struct super_block *sb);
extern int fsync_bdev(struct block_device *);
extern int fsync_super(struct super_block *);
extern int fsync_no_super(struct block_device *);
+#define FS_FREEZER_FUSE 1
+#define FS_FREEZER_NORMAL 2
+#define FS_FREEZER_ALL (FS_FREEZER_FUSE | FS_FREEZER_NORMAL)
#else
static inline void bd_forget(struct inode *inode) {}
static inline int sync_blockdev(struct block_device *bdev) { return 0; }
diff --git a/kernel/power/process.c b/kernel/power/process.c
index d464ef7..b2c7fe1 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -14,6 +14,8 @@
#include <linux/syscalls.h>
#include <linux/freezer.h>
+int freezer_state;
+
/*
* Timeout for stopping processes
*/
@@ -145,8 +147,17 @@ static void thaw_tasks(bool nosig_only)
void thaw_processes(void)
{
+ int old_state = freezer_state;
+
+ if (old_state == FREEZER_OFF)
+ return;
+
printk("Restarting tasks ... ");
thaw_tasks(true);
+ freezer_state = FREEZER_OFF;
+
+ if (old_state == FREEZER_FULLY_ON)
+ thaw_tasks(true);
thaw_tasks(false);
schedule();
printk("done.\n");
@@ -154,5 +165,6 @@ void thaw_processes(void)
void thaw_kernel_threads(void)
{
+ freezer_state = FREEZER_USERSPACE_FROZEN;
thaw_tasks(true);
echo disk > /sys/power/state
together with a compile time option to control the default and a sysfs
entry in TuxOnIce's directories to toggle it at run time. Since at
resume time we don't know whether we wrote a TuxOnIce image, always
check for TuxOnIce images when checking for swsusp images (TuxOnIce
recognises swsusp signatures and won't complain about them).
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/Kconfig | 14 ++++++++++++++
kernel/power/disk.c | 10 ++++++++++
kernel/power/tuxonice.h | 1 +
kernel/power/tuxonice_builtin.c | 3 +++
kernel/power/tuxonice_highlevel.c | 2 ++
5 files changed, 30 insertions(+), 0 deletions(-)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index f669cdd..dfe0328 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -259,6 +259,20 @@ menuconfig TOI_CORE
bit. Keep image mode is a little less user friendly on purpose - it
should not be used without thought!
+ config TOI_REPLACE_SWSUSP
+ bool "Replace swsusp by default"
+ default y
+ depends on TOI_CORE
+ ---help---
+ TuxOnIce can replace swsusp. This option makes that the default state,
+ requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want
+ to use the vanilla kernel functionality. Note that your initrd/ramfs will
+ need to do this before trying to resume, too.
+ With overriding swsusp enabled, echoing disk to /sys/power/state will
+ start a TuxOnIce cycle. If resume= doesn't specify an allocator and both
+ the swap and file allocators are compiled in, the swap allocator will be
+ used by default.
+
config TOI_IGNORE_LATE_INITCALL
bool "Wait for initrd/ramfs to run, by default"
default n
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 9a1174c..510730a 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -556,6 +556,9 @@ int hibernate(void)
{
int error;
+ if (test_action_state(TOI_REPLACE_SWSUSP))
+ return try_tuxonice_hibernate();
+
mutex_lock(&pm_mutex);
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -637,6 +640,13 @@ int software_resume(void)
{
int error;
unsigned int flags;
+ resume_attempted = 1;
+
+ /*
+ * We can't know (until an image header - if any - is loaded), whether
+ * we did override swsusp. We therefore ensure that both are tried.
+ */
+ try_tuxonice_resume();
/*
* If the user said "noresume".. bail out early.
diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h
index 2262973..1d6349d 100644
--- a/kernel/power/tuxonice.h
+++ b/kernel/power/tuxonice.h
@@ -55,6 +55,7 @@ enum {
TOI_TEST_BIO,
TOI_NO_PAGESET2,
TOI_IGNORE_ROOTFS,
+ TOI_REPLACE_SWSUSP,
TOI_PAGESET2_FULL,
TOI_ABORT_ON_RESAVE_NEEDED,
TOI_NO_MULTITHREADED_IO,
diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c
index dab9de2..da7680e 100644
--- a/kernel/power/tuxonice_builtin.c
+++ b/kernel/power/tuxonice_builtin.c
@@ -165,6 +165,9 @@ struct toi_boot_kernel_data toi_bkd __nosavedata
__attribute__((aligned(PAGE_SIZE))) = {
MY_BOOT_KERNEL_DATA_VERSION,
0,
+#ifdef CONFIG_TOI_REPLACE_SWSUSP
+ (1 << TOI_REPLACE_SWSUSP) |
+#endif
(1 << TOI_NO_FLUSHER_THREAD) |
(1 << TOI_PAGESET2_FULL) | (1 << TOI_LATE_CPU_HOTPLUG),
};
diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c
index 06f97e7..9bd5b34 100644
--- a/kernel/power/tuxonice_highlevel.c
+++ b/kernel/power/tuxonice_highlevel.c
@@ -1190,6 +1190,8 @@ static struct toi_sysfs_data sysfs_params[] = {
SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action,
TOI_PAGESET2_FULL, 0),
SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0),
+ SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_REPLACE_SWSUSP, 0),
SYSFS_STRING("resume_commandline", SYSFS_RW,
toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0,
NULL),
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/disk.c | 20 +++++-----
kernel/power/power.h | 87 +++++++++++++++++++++++++++++++++++++++++++++++
kernel/power/snapshot.c | 69 +++++++------------------------------
3 files changed, 110 insertions(+), 66 deletions(-)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e71ca9c..e4b1166 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -29,7 +29,7 @@
static int noresume = 0;
-static char resume_file[256] = CONFIG_PM_STD_PARTITION;
+char resume_file[256] = CONFIG_PM_STD_PARTITION;
dev_t swsusp_resume_device;
sector_t swsusp_resume_block;
@@ -115,7 +115,7 @@ static int hibernation_test(int level) { return 0; }
* hibernation
*/
-static int platform_begin(int platform_mode)
+int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->begin() : 0;
@@ -126,7 +126,7 @@ static int platform_begin(int platform_mode)
* working state
*/
-static void platform_end(int platform_mode)
+void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->end();
@@ -137,7 +137,7 @@ static void platform_end(int platform_mode)
* platform driver if so configured and return an error code if it fails
*/
-static int platform_pre_snapshot(int platform_mode)
+int platform_pre_snapshot(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_snapshot() : 0;
@@ -148,7 +148,7 @@ static int platform_pre_snapshot(int platform_mode)
* of operation using the platform driver (called with interrupts disabled)
*/
-static void platform_leave(int platform_mode)
+void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->leave();
@@ -159,7 +159,7 @@ static void platform_leave(int platform_mode)
* using the platform driver (must be called after platform_prepare())
*/
-static void platform_finish(int platform_mode)
+void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->finish();
@@ -171,7 +171,7 @@ static void platform_finish(int platform_mode)
* called, platform_restore_cleanup() must be called.
*/
-static int platform_pre_restore(int platform_mode)
+int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
hibernation_ops->pre_restore() : 0;
@@ -184,7 +184,7 @@ static int platform_pre_restore(int platform_mode)
* regardless of the result of platform_pre_restore().
*/
-static void platform_restore_cleanup(int platform_mode)
+void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
hibernation_ops->restore_cleanup();
@@ -195,7 +195,7 @@ static void platform_restore_cleanup(int platform_mode)
* devices.
*/
-static void platform_recover(int platform_mode)
+void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
hibernation_ops->recover();
@@ -634,7 +634,7 @@ int hibernate(void)
*
*/
-static int software_resume(void)
+int software_resume(void)
{
int error;
unsigned int flags;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 46b5ec7..ce81df1 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -31,8 +31,12 @@ static inline char *check_image_kernel(struct swsusp_info *info)
return arch_hibernation_header_restore(info) ?
"architecture specific data" : NULL;
}
+#else
+extern char *check_image_kernel(struct swsusp_info *info);
#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
+extern int init_header(struct swsusp_info *info);
+extern char resume_file[256];
/*
* Keep some memory free so that I/O operations can succeed without paging
* [Might this be more than 4 MB?]
@@ -49,6 +53,7 @@ static inline char *check_image_kernel(struct swsusp_info *info)
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
+extern void platform_recover(int platform_mode);
#endif
extern int pfn_is_nosave(unsigned long);
@@ -63,6 +68,8 @@ static struct kobj_attribute _name##_attr = { \
.store = _name##_store, \
}
+extern struct pbe *restore_pblist;
+
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
extern int in_suspend;
@@ -223,3 +230,83 @@ static inline void suspend_thaw_processes(void)
{
}
#endif
+
+extern struct page *saveable_page(struct zone *z, unsigned long p);
+#ifdef CONFIG_HIGHMEM
+extern struct page *saveable_highmem_page(struct zone *z, unsigned long p);
+#else
+static
+inline struct page *saveable_highmem_page(struct zone *z, unsigned long p)
+{
+ return NULL;
+}
+#endif
+
+#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe))
+extern struct list_head nosave_regions;
+
+/**
+ * This structure represents a range of page frames the contents of which
+ * should not be saved during the suspend.
+ */
+
+struct nosave_region {
+ struct list_head list;
+ unsigned long start_pfn;
+ unsigned long end_pfn;
+};
+
+#ifndef PHYS_PFN_OFFSET
+#define PHYS_PFN_OFFSET 0
+#endif
+
+#define ZONE_START(thiszone) ((thiszone)->zone_start_pfn - PHYS_PFN_OFFSET)
+
+#define BM_END_OF_MAP (~0UL)
+
+#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
+
+struct bm_block {
+ struct list_head hook; /* hook into a list of bitmap blocks */
+ unsigned long start_pfn; /* pfn represented by the first bit */
+ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
+ unsigned long *data; /* bitmap representing pages */
+};
+
+/* struct bm_position is used for browsing memory bitmaps */
+
+struct bm_position {
+ struct bm_block *block;
+ int bit;
+};
+
+struct memory_bitmap {
+ struct list_head blocks; /* list of bitmap blocks */
+ struct linked_page *p_list; /* list of pages used to store zone
+ * bitmap objects and bitmap block
+ * objects
+ */
+ struct bm_position cur; /* most recently used bit position */
+};
+
+extern int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask,
+ int safe_needed);
+extern void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
+extern void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn);
+extern void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn);
+extern int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn);
+extern unsigned long memory_bm_next_pfn(struct memory_bitmap *bm);
+extern void memory_bm_position_reset(struct memory_bitmap *bm);
+extern void memory_bm_clear(struct memory_bitmap *bm);
+extern void memory_bm_copy(struct memory_bitmap *source,
+ struct memory_bitmap *dest);
+extern void memory_bm_dup(struct memory_bitmap *source,
+ struct memory_bitmap *dest);
+
+#ifdef CONFIG_TOI
+struct toi_module_ops;
+extern int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk)
+ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
+extern int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk)
+ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size));
+#endif
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 33e2e4a..8020644 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -223,48 +223,19 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
* the represented memory area.
*/
-#define BM_END_OF_MAP (~0UL)
-
-#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
-
-struct bm_block {
- struct list_head hook; /* hook into a list of bitmap blocks */
- unsigned long start_pfn; /* pfn represented by the first bit */
- unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
- unsigned long *data; /* bitmap representing pages */
-};
-
static inline unsigned long bm_block_bits(struct bm_block *bb)
{
return bb->end_pfn - bb->start_pfn;
}
-/* strcut bm_position is used for browsing memory bitmaps */
-
-struct bm_position {
- struct bm_block *block;
- int bit;
-};
-
-struct memory_bitmap {
- struct list_head blocks; /* list of bitmap blocks */
- struct linked_page *p_list; /* list of pages used to store zone
- * bitmap objects and bitmap block
- * objects
- */
- struct bm_position cur; /* most recently used bit position */
-};
-
/* Functions that operate on memory bitmaps */
-static void memory_bm_position_reset(struct memory_bitmap *bm)
+void memory_bm_position_reset(struct memory_bitmap *bm)
{
bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook);
bm->cur.bit = 0;
}
-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
-
/**
* create_bm_block_list - create a list of block bitmap objects
* @nr_blocks - number of blocks to allocate
@@ -371,7 +342,7 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
/**
* memory_bm_create - allocate memory for a memory bitmap
*/
-static int
+int
memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
{
struct chain_allocator ca;
@@ -431,7 +402,7 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
/**
* memory_bm_free - free memory occupied by the memory bitmap @bm
*/
-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
+void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
{
struct bm_block *bb;
@@ -481,7 +452,7 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
return 0;
}
-static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
+void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
@@ -504,7 +475,7 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn)
return error;
}
-static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
+void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
@@ -515,7 +486,7 @@ static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn)
clear_bit(bit, addr);
}
-static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
+int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
{
void *addr;
unsigned int bit;
@@ -543,7 +514,7 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn)
* this function.
*/
-static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
+unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
{
struct bm_block *bb;
int bit;
@@ -568,18 +539,9 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
return bb->start_pfn + bit;
}
-/**
- * This structure represents a range of page frames the contents of which
- * should not be saved during the suspend.
- */
-struct nosave_region {
- struct list_head list;
- unsigned long start_pfn;
- unsigned long end_pfn;
-};
-static LIST_HEAD(nosave_regions);
+LIST_HEAD(nosave_regions);
/**
* register_nosave_region - register a range of page frames the contents
@@ -815,7 +777,7 @@ static unsigned int count_free_highmem_pages(void)
* We should save the page if it isn't Nosave or NosaveFree, or Reserved,
* and it isn't a part of a free chunk of pages.
*/
-static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
+struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -859,11 +821,6 @@ unsigned int count_highmem_pages(void)
}
return n;
}
-#else
-static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
-{
- return NULL;
-}
#endif /* CONFIG_HIGHMEM */
/**
@@ -874,7 +831,7 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p)
* of pages statically defined as 'unsaveable', and it isn't a part of
* a free chunk of pages.
*/
-static struct page *saveable_page(struct zone *zone, unsigned long pfn)
+struct page *saveable_page(struct zone *zone, unsigned long pfn)
{
struct page *page;
@@ -1250,14 +1207,14 @@ asmlinkage int swsusp_save(void)
}
#ifndef CONFIG_ARCH_HIBERNATION_HEADER
-static int init_header_complete(struct swsusp_info *info)
+int init_header_complete(struct swsusp_info *info)
{
memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname));
info->version_code = LINUX_VERSION_CODE;
return 0;
}
-static char *check_image_kernel(struct swsusp_info *info)
+char *check_image_kernel(struct swsusp_info *info)
{
if (info->version_code != LINUX_VERSION_CODE)
return "kernel version";
@@ -1278,7 +1235,7 @@ unsigned long snapshot_get_image_size(void)
return nr_copy_pages + nr_meta_pages + 1;
}
-static int init_header(struct swsusp_info *info)
+int init_header(struct swsusp_info *info)
{
memset(info, 0, sizeof(struct swsusp_info));
info->num_physpages = num_physpages;
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/tuxonice_block_io.c | 1319 ++++++++++++++++++++++++++++++++++++++
kernel/power/tuxonice_block_io.h | 59 ++
2 files changed, 1378 insertions(+), 0 deletions(-)
create mode 100644 kernel/power/tuxonice_block_io.c
create mode 100644 kernel/power/tuxonice_block_io.h
diff --git a/kernel/power/tuxonice_block_io.c b/kernel/power/tuxonice_block_io.c
new file mode 100644
index 0000000..c1936f4
--- /dev/null
+++ b/kernel/power/tuxonice_block_io.c
@@ -0,0 +1,1319 @@
+/*
+ * kernel/power/tuxonice_block_io.c
+ *
+ * Copyright (C) 2004-2008 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains block io functions for TuxOnIce. These are
+ * used by the swapwriter and it is planned that they will also
+ * be used by the NFSwriter.
+ *
+ */
+
+#include <linux/blkdev.h>
+#include <linux/syscalls.h>
+#include <linux/suspend.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_prepare_image.h"
+#include "tuxonice_block_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_io.h"
+
+#define MEMORY_ONLY 1
+#define THROTTLE_WAIT 2
+
+/* #define MEASURE_MUTEX_CONTENTION */
+/*
+ * my_mutex_lock()/my_mutex_unlock() wrap mutex_lock()/mutex_unlock().
+ *
+ * When MEASURE_MUTEX_CONTENTION is defined, my_mutex_lock() first tries
+ * mutex_trylock() and counts, per CPU, whether the mutex had to be waited
+ * for (mutex_times[index][0]) or was obtained immediately
+ * (mutex_times[index][1]).
+ *
+ * NOTE: in the measuring variant my_mutex_lock() opens a "do {" that is only
+ * closed by the "} while (0)" at the end of my_mutex_unlock(), so the two
+ * macros must always be used as a matched pair in the same scope.
+ */
+#ifndef MEASURE_MUTEX_CONTENTION
+#define my_mutex_lock(index, the_lock) mutex_lock(the_lock)
+#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock)
+#else
+unsigned long mutex_times[2][2][NR_CPUS];
+#define my_mutex_lock(index, the_lock) do { \
+ int have_mutex; \
+ have_mutex = mutex_trylock(the_lock); \
+ if (!have_mutex) { \
+ mutex_lock(the_lock); \
+ mutex_times[index][0][smp_processor_id()]++; \
+ } else { \
+ mutex_times[index][1][smp_processor_id()]++; \
+ }
+
+#define my_mutex_unlock(index, the_lock) \
+ mutex_unlock(the_lock); \
+} while (0)
+#endif
+
+/* Number of I/O transactions we aim to be able to have in flight at once. */
+static int target_outstanding_io = 1024;
+/* High-water marks of toi_io_in_progress, recorded in submit(). */
+static int max_outstanding_writes, max_outstanding_reads;
+
+/*
+ * Pages queued for writing but not yet submitted. The list is singly linked
+ * through page->private and is modified under bio_queue_lock.
+ */
+static struct page *bio_queue_head, *bio_queue_tail;
+static atomic_t toi_bio_queue_size;
+static DEFINE_SPINLOCK(bio_queue_lock);
+
+/* Throttle thresholds; both start at zero. */
+static int free_mem_throttle, throughput_throttle;
+/* Cleared by go_next_page() when reading reaches the end of a stream. */
+static int more_readahead = 1;
+/* Readahead pages, singly linked through page->private. */
+static struct page *readahead_list_head, *readahead_list_tail;
+static DECLARE_WAIT_QUEUE_HEAD(readahead_list_wait);
+
+/* The page do_bio_wait() should wait for, if any; reset by toi_end_bio(). */
+static struct page *waiting_on;
+
+/* Submitted-but-incomplete and completed I/O counts. */
+static atomic_t toi_io_in_progress, toi_io_done;
+/* Woken by toi_end_bio() each time an I/O completes. */
+static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait);
+
+/* Set via set_extra_page_forward(); consumed by go_next_page(). */
+static int extra_page_forward;
+
+/* Stream being processed, set in toi_rw_init(). */
+static int current_stream;
+/* 0 = Header, 1 = Pageset1, 2 = Pageset2, 3 = End of PS1 */
+struct hibernate_extent_iterate_saved_state toi_writer_posn_save[4];
+
+/* Pointer to current entry being loaded/saved. */
+struct toi_extent_iterate_state toi_writer_posn;
+
+/* Not static, so that the allocators can setup and complete
+ * writing the header */
+char *toi_writer_buffer;
+int toi_writer_buffer_posn;
+
+/* Array of bdev info, indexed by chain number; set by toi_set_devinfo(). */
+static struct toi_bdev_info *toi_devinfo;
+
+static DEFINE_MUTEX(toi_bio_mutex);
+static DEFINE_MUTEX(toi_bio_readahead_mutex);
+
+static struct task_struct *toi_queue_flusher;
+static int toi_bio_queue_flush_pages(int dedicated_thread);
+
+/* I/O submitted and not yet completed, plus pages queued for submission. */
+#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \
+ atomic_read(&toi_bio_queue_size))
+
+/**
+ * set_free_mem_throttle - raise the free memory level at which we pause i/o
+ *
+ * Used after a failed allocation. A candidate threshold is computed from
+ * the current number of unallocated buffer pages plus a 256 page margin and
+ * is stored only if it exceeds the existing threshold, so the throttle
+ * point (initially zero) never decreases here.
+ **/
+static void set_free_mem_throttle(void)
+{
+	int candidate = nr_unallocated_buffer_pages() + 256;
+
+	if (candidate <= free_mem_throttle)
+		return;
+
+	free_mem_throttle = candidate;
+}
+
+/*
+ * Per-reason counters of how often we waited for I/O, with matching names.
+ * The indices are passed as literals by the callers in this file:
+ * 0 toi_bio_get_next_page_read(), 1 submit(), 2 toi_do_io(),
+ * 3 toi_bio_get_new_page(), 4 and 6 throttle_if_needed(),
+ * 5 toi_start_one_readahead(). toi_rw_cleanup() prints and resets them.
+ */
+#define NUM_REASONS 7
+static atomic_t reasons[NUM_REASONS];
+static char *reason_name[NUM_REASONS] = {
+ "readahead not ready",
+ "bio allocation",
+ "synchronous I/O",
+ "toi_bio_get_new_page",
+ "memory low",
+ "readahead buffer allocation",
+ "throughput_throttle",
+};
+
+/**
+ * do_bio_wait - wait for some TuxOnIce I/O to complete
+ * @reason: Index into reasons[] identifying why we are waiting.
+ *
+ * If a specific page is being waited on, sleep until that page is unlocked
+ * (doing nothing if it is already unlocked). Otherwise sleep until either no
+ * I/O is in progress or the number of unallocated buffer pages exceeds the
+ * free memory throttle.
+ *
+ * The reason counter is incremented whenever we actually wait.
+ **/
+static void do_bio_wait(int reason)
+{
+	/* Take a copy: on SMP, waiting_on can be reset underneath us. */
+	struct page *target = waiting_on;
+
+	if (!target) {
+		atomic_inc(&reasons[reason]);
+
+		wait_event(num_in_progress_wait,
+			   !atomic_read(&toi_io_in_progress) ||
+			   nr_unallocated_buffer_pages() > free_mem_throttle);
+		return;
+	}
+
+	if (!PageLocked(target))
+		return;
+
+	wait_on_page_bit(target, PG_locked);
+	atomic_inc(&reasons[reason]);
+}
+
+/**
+ * throttle_if_needed - wait for I/O completion if throttle points are reached
+ * @flags: What to check and how to act (MEMORY_ONLY and/or THROTTLE_WAIT).
+ *
+ * Check whether we need to wait for some I/O to complete. We always check
+ * whether we have enough memory available, but may also (unless MEMORY_ONLY
+ * is set in @flags) check if the throughput throttle limit has been reached.
+ *
+ * Returns 0 on success, -ENOMEM if memory is low while I/O is in progress
+ * and THROTTLE_WAIT is not set, or a non-zero result from
+ * toi_bio_queue_flush_pages().
+ **/
+static int throttle_if_needed(int flags)
+{
+ int free_pages = nr_unallocated_buffer_pages();
+
+ /* Getting low on memory and I/O is in progress? */
+ while (unlikely(free_pages < free_mem_throttle) &&
+ atomic_read(&toi_io_in_progress)) {
+ if (!(flags & THROTTLE_WAIT))
+ return -ENOMEM;
+ /* Reason 4: "memory low" */
+ do_bio_wait(4);
+ free_pages = nr_unallocated_buffer_pages();
+ }
+
+ /* A throughput_throttle of zero disables the throughput check. */
+ while (!(flags & MEMORY_ONLY) && throughput_throttle &&
+ TOTAL_OUTSTANDING_IO >= throughput_throttle) {
+ /* Submit anything queued so that it can complete. */
+ int result = toi_bio_queue_flush_pages(0);
+ if (result)
+ return result;
+ /* Reason 6: "throughput_throttle" */
+ atomic_inc(&reasons[6]);
+ wait_event(num_in_progress_wait,
+ !atomic_read(&toi_io_in_progress) ||
+ TOTAL_OUTSTANDING_IO < throughput_throttle);
+ }
+
+ return 0;
+}
+
+/**
+ * update_throughput_throttle - update the raw throughput throttle
+ * @jif_index: The number of times this function has been called.
+ *
+ * This function is called twice per second by the core, and used to limit the
+ * amount of I/O we submit at once, spreading out our waiting through the
+ * whole job and letting userui get an opportunity to do its work.
+ *
+ * We don't start limiting I/O until 1/2s has gone so that we get a
+ * decent sample for our initial limit, and keep updating it because
+ * throughput may vary (on rotating media, eg) with our block number.
+ *
+ * We throttle to 1/10s worth of I/O: the completed I/O count is averaged
+ * over @jif_index half-second periods and divided by five.
+ *
+ * A non-positive @jif_index leaves the throttle unchanged.
+ **/
+static void update_throughput_throttle(int jif_index)
+{
+	int done;
+
+	/* No sample period yet; also avoids a division by zero. */
+	if (jif_index <= 0)
+		return;
+
+	done = atomic_read(&toi_io_done);
+	throughput_throttle = done / jif_index / 5;
+}
+
+/**
+ * toi_finish_all_io - wait for all outstanding i/o to complete
+ *
+ * Submit whatever is still sitting in the write queue, then sleep until
+ * nothing is queued or in flight.
+ *
+ * Returns the result of flushing the queue.
+ **/
+static int toi_finish_all_io(void)
+{
+	int flush_result;
+
+	flush_result = toi_bio_queue_flush_pages(0);
+	wait_event(num_in_progress_wait, TOTAL_OUTSTANDING_IO == 0);
+
+	return flush_result;
+}
+
+/**
+ * toi_end_bio - bio completion function.
+ * @bio: bio that has completed.
+ * @err: Error value. Yes, like end_swap_bio_read, we ignore it.
+ *
+ * Function called by the block driver from interrupt context when I/O is
+ * completed. If we were writing the page, we want to free it and will have
+ * set bio->bi_private to the parameter we should use in telling the page
+ * allocation accounting code what the page was allocated for. If we're
+ * reading the page, it will be in the singly linked list made from
+ * page->private pointers.
+ *
+ * Although @err is ignored, a bio that is not marked BIO_UPTODATE triggers
+ * BUG_ON().
+ **/
+static void toi_end_bio(struct bio *bio, int err)
+{
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
+
+ /* Releases anyone sleeping in do_bio_wait() on this page. */
+ unlock_page(page);
+ /*
+ * First of two puts: submit() calls bio_get() on every bio it sends,
+ * so the bio stays valid until the second bio_put() below.
+ */
+ bio_put(bio);
+
+ if (waiting_on == page)
+ waiting_on = NULL;
+
+ /* Pairs with the get_page() in toi_do_io(). */
+ put_page(page);
+
+ /* A non-zero bi_private is the allocation group to free the page to. */
+ if (bio->bi_private)
+ toi__free_page((int) ((unsigned long) bio->bi_private) , page);
+
+ bio_put(bio);
+
+ atomic_dec(&toi_io_in_progress);
+ atomic_inc(&toi_io_done);
+
+ wake_up(&num_in_progress_wait);
+}
+
+/**
+ * submit - submit BIO request
+ * @writing: READ or WRITE.
+ * @dev: The block device we're using.
+ * @first_block: The first sector we're using.
+ * @page: The page being used for I/O.
+ * @free_group: If writing, the group that was used in allocating the page
+ * and which will be used in freeing the page from the completion
+ * routine.
+ *
+ * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the
+ * textbook - allocate and initialize the bio. If we're writing, make sure
+ * the page is marked as dirty. Then submit it and carry on."
+ *
+ * If we're just testing the speed of our own code, we fake having done all
+ * the hard work and call toi_end_bio immediately.
+ *
+ * Returns 0 on success, a non-zero result from throttle_if_needed() when
+ * writing, or -EFAULT if the page could not be added to the bio.
+ **/
+static int submit(int writing, struct block_device *dev, sector_t first_block,
+ struct page *page, int free_group)
+{
+ struct bio *bio = NULL;
+ int cur_outstanding_io, result;
+
+ /*
+ * Shouldn't throttle if reading - can deadlock in the single
+ * threaded case as pages are only freed when we use the
+ * readahead.
+ */
+ if (writing) {
+ result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT);
+ if (result)
+ return result;
+ }
+
+ /* Keep retrying the allocation, waiting for I/O between attempts. */
+ while (!bio) {
+ bio = bio_alloc(TOI_ATOMIC_GFP, 1);
+ if (!bio) {
+ set_free_mem_throttle();
+ /* Reason 1: "bio allocation" */
+ do_bio_wait(1);
+ }
+ }
+
+ bio->bi_bdev = dev;
+ bio->bi_sector = first_block;
+ /* toi_end_bio() frees the page to this group when it is non-zero. */
+ bio->bi_private = (void *) ((unsigned long) free_group);
+ bio->bi_end_io = toi_end_bio;
+
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
+ printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n",
+ (unsigned long long) first_block);
+ bio_put(bio);
+ return -EFAULT;
+ }
+
+ /* Extra reference; toi_end_bio() calls bio_put() twice. */
+ bio_get(bio);
+
+ /* Count the I/O and track the high-water marks for the debug stats. */
+ cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress);
+ if (writing) {
+ if (cur_outstanding_io > max_outstanding_writes)
+ max_outstanding_writes = cur_outstanding_io;
+ } else {
+ if (cur_outstanding_io > max_outstanding_reads)
+ max_outstanding_reads = cur_outstanding_io;
+ }
+
+
+ if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) {
+ /* Fake having done the hard work */
+ set_bit(BIO_UPTODATE, &bio->bi_flags);
+ toi_end_bio(bio, 0);
+ } else
+ submit_bio(writing | (1 << BIO_RW_SYNCIO) |
+ (1 << BIO_RW_UNPLUG), bio);
+
+ return 0;
+}
+
+/**
+ * toi_do_io: Prepare to do some i/o on a page and submit or batch it.
+ *
+ * @writing: Whether reading or writing.
+ * @bdev: The block device which we're using.
+ * @block0: The first sector we're reading or writing.
+ * @page: The page on which I/O is being done.
+ * @is_readahead: If set, the page is appended to the readahead list.
+ * @syncio: Whether the i/o is being done synchronously.
+ * @free_group: Passed to submit(); the group the page was allocated from.
+ *
+ * Prepare and start a read or write operation.
+ *
+ * Note that we always work with our own page. If writing, we might be given a
+ * compression buffer that will immediately be used to start compressing the
+ * next page. For reading, we do readahead and therefore don't know the final
+ * address where the data needs to go.
+ *
+ * Returns 0 on success or -EFAULT if submit() failed.
+ * NOTE(review): on that failure path the page is left locked, still holds
+ * the reference taken here and, for readahead, stays on the readahead
+ * list - confirm that callers cope with this.
+ **/
+static int toi_do_io(int writing, struct block_device *bdev, long block0,
+ struct page *page, int is_readahead, int syncio, int free_group)
+{
+ /* page->private is the 'next' link in our page lists. */
+ page->private = 0;
+
+ /* Do here so we don't race against toi_bio_get_next_page_read */
+ lock_page(page);
+
+ if (is_readahead) {
+ if (readahead_list_head)
+ readahead_list_tail->private = (unsigned long) page;
+ else
+ readahead_list_head = page;
+
+ readahead_list_tail = page;
+ wake_up(&readahead_list_wait);
+ }
+
+ /* Done before submitting to avoid races. */
+ if (syncio)
+ waiting_on = page;
+
+ /* Submit the page */
+ /* This reference is dropped by the put_page() in toi_end_bio(). */
+ get_page(page);
+
+ if (submit(writing, bdev, block0, page, free_group))
+ return -EFAULT;
+
+ /* Reason 2: "synchronous I/O" */
+ if (syncio)
+ do_bio_wait(2);
+
+ return 0;
+}
+
+/**
+ * toi_bdev_page_io - do synchronous i/o on a single page
+ * @writing: Whether reading or writing.
+ * @bdev: Block device on which we're operating.
+ * @pos: Sector at which page to read or write starts.
+ * @page: Page to be read/written.
+ *
+ * Thin wrapper around toi_do_io(): the page is not treated as readahead,
+ * the i/o is waited for, and no free group is passed, so the caller
+ * remains responsible for freeing the page.
+ **/
+static int toi_bdev_page_io(int writing, struct block_device *bdev,
+		long pos, struct page *page)
+{
+	const int is_readahead = 0, syncio = 1, free_group = 0;
+
+	return toi_do_io(writing, bdev, pos, page, is_readahead, syncio,
+			free_group);
+}
+
+/**
+ * toi_bio_memory_needed - report the amount of memory needed for block i/o
+ *
+ * Each outstanding transaction is costed as one page plus one struct
+ * request plus one struct bio; the result is that cost multiplied by
+ * target_outstanding_io. More may be used if it is available.
+ **/
+static int toi_bio_memory_needed(void)
+{
+	unsigned long per_transaction = PAGE_SIZE + sizeof(struct request) +
+		sizeof(struct bio);
+
+	return target_outstanding_io * per_transaction;
+}
+
+/**
+ * toi_bio_print_debug_stats - put out debugging info in the buffer provided
+ * @buffer: A buffer of size @size into which text should be placed.
+ * @size: The size of @buffer.
+ *
+ * Writes the maximum outstanding reads and writes, the memory requirement
+ * calculation, optionally (MEASURE_MUTEX_CONTENTION) the per-cpu mutex
+ * contention counters, and the free memory throttle point.
+ *
+ * Returns the number of characters written, as accumulated from scnprintf().
+ **/
+static int toi_bio_print_debug_stats(char *buffer, int size)
+{
+	int used;
+
+	used = scnprintf(buffer, size, "- Max outstanding reads %d. Max "
+			"writes %d.\n", max_outstanding_reads,
+			max_outstanding_writes);
+
+	used += scnprintf(buffer + used, size - used,
+		" Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n",
+		target_outstanding_io,
+		PAGE_SIZE, (unsigned int) sizeof(struct request),
+		(unsigned int) sizeof(struct bio), toi_bio_memory_needed());
+
+#ifdef MEASURE_MUTEX_CONTENTION
+	{
+		int cpu;
+
+		used += scnprintf(buffer + used, size - used,
+			" Mutex contention while reading:\n Contended Free\n");
+
+		for_each_online_cpu(cpu)
+			used += scnprintf(buffer + used, size - used,
+			" %9lu %9lu\n",
+			mutex_times[0][0][cpu], mutex_times[0][1][cpu]);
+
+		used += scnprintf(buffer + used, size - used,
+			" Mutex contention while writing:\n Contended Free\n");
+
+		for_each_online_cpu(cpu)
+			used += scnprintf(buffer + used, size - used,
+			" %9lu %9lu\n",
+			mutex_times[1][0][cpu], mutex_times[1][1][cpu]);
+
+	}
+#endif
+
+	used += scnprintf(buffer + used, size - used,
+		" Free mem throttle point reached %d.\n", free_mem_throttle);
+
+	return used;
+}
+
+/**
+ * toi_set_devinfo - record the bdev info array used for i/o
+ * @info: Array of struct toi_bdev_info describing the block devices (and
+ *	  the blocks on them) that hold the image.
+ *
+ * The pointer is stored as-is; the array is later indexed by chain number.
+ * Taken together the entries behave like one long tape on which the image
+ * data is stored.
+ **/
+static void toi_set_devinfo(struct toi_bdev_info *info)
+{
+	toi_devinfo = info;
+}
+
+/**
+ * dump_block_chains - log the extent chains and the saved positions
+ *
+ * Prints every non-empty chain in toi_writer_posn as a comma separated
+ * list of [start-end] extents, followed by the four saved stream positions.
+ **/
+static void dump_block_chains(void)
+{
+	int i;
+
+	for (i = 0; i < toi_writer_posn.num_chains; i++) {
+		struct hibernate_extent *ext = toi_writer_posn.chains[i].first;
+
+		if (!ext)
+			continue;
+
+		printk(KERN_DEBUG "Chain %d:", i);
+
+		for (; ext; ext = ext->next)
+			printk(" [%lu-%lu]%s", ext->start,
+					ext->end, ext->next ? "," : "");
+
+		printk("\n");
+	}
+
+	for (i = 0; i < 4; i++) {
+		struct hibernate_extent_iterate_saved_state *saved =
+			&toi_writer_posn_save[i];
+
+		printk(KERN_DEBUG "Posn %d: Chain %d, extent %d, offset %lu.\n",
+				i, saved->chain_num, saved->extent_num,
+				saved->offset);
+	}
+}
+
+/* Header bytes read or written so far, in total and without an owner. */
+static int total_header_bytes;
+static int unowned;
+
+/**
+ * debug_broken_header - report that the image header outgrew its space
+ *
+ * Logs the header space accounting (module storage via
+ * print_toi_header_storage_for_modules(), page flags, the size of struct
+ * toi_header, unowned and total bytes, and the space needed now), dumps the
+ * block chains and aborts the hibernation cycle.
+ *
+ * Always returns -EIO.
+ **/
+static int debug_broken_header(void)
+{
+	printk(KERN_DEBUG "Image header too big for size allocated!\n");
+	print_toi_header_storage_for_modules();
+	printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed());
+	/* sizeof yields a size_t, which must be printed with %zu, not %ld. */
+	printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header));
+	printk(KERN_DEBUG "Total unowned : %d.\n", unowned);
+	printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes,
+			DIV_ROUND_UP(total_header_bytes, PAGE_SIZE));
+	printk(KERN_DEBUG "Space needed now : %ld.\n",
+			get_header_storage_needed());
+	dump_block_chains();
+	abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small.");
+	return -EIO;
+}
+
+/**
+ * go_next_page - skip blocks to the start of the next page
+ * @writing: Whether we're reading or writing the image.
+ * @section_barrier: If set, first check whether the current position is the
+ * saved position that bounds the current stream.
+ *
+ * Go forward one page, or two if extra_page_forward is set. It only gets
+ * set at the start of reading the image header, to skip the first page
+ * of the header, which is read without using the extent chains.
+ *
+ * Returns 0 on success, -ENODATA if the end of the stream (when reading) or
+ * of the extents is reached, or the result of debug_broken_header() if the
+ * header stream hits its bound while writing.
+ **/
+static int go_next_page(int writing, int section_barrier)
+{
+ int i, chain_num = toi_writer_posn.current_chain,
+ max = (chain_num == -1) ? 1 : toi_devinfo[chain_num].blocks_per_page,
+ compare_to = 0, compare_chain, compare_offset;
+
+ /* Have we already used the last page of the stream? */
+ /*
+ * Choose which saved position bounds the current stream; any other
+ * stream number leaves compare_to at 0.
+ */
+ switch (current_stream) {
+ case 0:
+ compare_to = 2;
+ break;
+ case 1:
+ compare_to = 3;
+ break;
+ case 2:
+ compare_to = 1;
+ break;
+ }
+
+ compare_chain = toi_writer_posn_save[compare_to].chain_num;
+ compare_offset = toi_writer_posn_save[compare_to].offset;
+
+ if (section_barrier && chain_num == compare_chain &&
+ toi_writer_posn.current_offset == compare_offset) {
+ if (writing) {
+ /* Only the header stream (0) is treated as an error here. */
+ if (!current_stream)
+ return debug_broken_header();
+ } else {
+ more_readahead = 0;
+ return -ENODATA;
+ }
+ }
+
+ /* Nope. Go forward a page - or maybe two */
+ for (i = 0; i < max; i++)
+ toi_extent_state_next(&toi_writer_posn);
+
+ if (toi_extent_state_eof(&toi_writer_posn)) {
+ /* Don't complain if readahead falls off the end */
+ if (writing && section_barrier) {
+ printk(KERN_DEBUG "Extent state eof. "
+ "Expected compression ratio too optimistic?\n");
+ dump_block_chains();
+ }
+ return -ENODATA;
+ }
+
+ /* One-shot: clear the flag, then advance one more page. */
+ if (extra_page_forward) {
+ extra_page_forward = 0;
+ return go_next_page(writing, section_barrier);
+ }
+
+ return 0;
+}
+
+/**
+ * set_extra_page_forward - request a one-off extra page skip
+ *
+ * Sets the flag that makes the next go_next_page() call advance by two
+ * pages instead of one. Used when reading the header, where the first page
+ * is fetched directly and reading must resume at the second page.
+ **/
+static void set_extra_page_forward(void)
+{
+	extra_page_forward = 1;
+}
+
+/**
+ * toi_bio_rw_page - do i/o on the next disk page in the image
+ * @writing: Whether reading or writing.
+ * @page: Page to do i/o on.
+ * @is_readahead: Whether we're doing readahead
+ * @free_group: The group used in allocating the page
+ *
+ * Advance to the next page position in the image and start asynchronous
+ * i/o on it. @free_group is handed on so that a written page can be freed
+ * when its bio completes.
+ *
+ * Returns the error from go_next_page() if the position could not be
+ * advanced, otherwise the result of toi_do_io().
+ **/
+static int toi_bio_rw_page(int writing, struct page *page,
+		int is_readahead, int free_group)
+{
+	struct toi_bdev_info *dev_info;
+	long first_sector;
+	int result;
+
+	result = go_next_page(writing, 1);
+	if (result)
+		return result;
+
+	dev_info = toi_devinfo + toi_writer_posn.current_chain;
+	first_sector = toi_writer_posn.current_offset << dev_info->bmap_shift;
+
+	return toi_do_io(writing, dev_info->bdev, first_sector,
+			page, is_readahead, 0, free_group);
+}
+
+/**
+ * toi_rw_init - prepare to read or write a stream in the image
+ * @writing: Whether reading or writing.
+ * @stream_number: Section of the image being processed.
+ *
+ * Position the extent iterator (at the start for stream 0, otherwise at
+ * the position saved for the stream), reset the completed i/o count,
+ * allocate the page sized writer buffer and re-enable readahead.
+ *
+ * When reading, the buffer position starts at PAGE_SIZE so that the buffer
+ * counts as fully consumed.
+ *
+ * Returns 0, or -ENOMEM if the buffer could not be allocated.
+ **/
+static int toi_rw_init(int writing, int stream_number)
+{
+	if (!stream_number)
+		toi_extent_state_goto_start(&toi_writer_posn);
+	else
+		toi_extent_state_restore(&toi_writer_posn,
+				&toi_writer_posn_save[stream_number]);
+
+	atomic_set(&toi_io_done, 0);
+	toi_writer_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
+
+	if (writing)
+		toi_writer_buffer_posn = 0;
+	else
+		toi_writer_buffer_posn = PAGE_SIZE;
+
+	current_stream = stream_number;
+
+	more_readahead = 1;
+
+	if (!toi_writer_buffer)
+		return -ENOMEM;
+
+	return 0;
+}
+
+/**
+ * toi_read_header_init - prepare to read the image header
+ *
+ * Allocates a zeroed page for the writer buffer and marks readahead as
+ * available again. The allocation result is not checked here.
+ **/
+static void toi_read_header_init(void)
+{
+	char *header_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
+
+	toi_writer_buffer = header_buffer;
+	more_readahead = 1;
+}
+
+/**
+ * toi_bio_queue_write - queue a page for writing
+ * @full_buffer: Pointer to the address of the page to be queued
+ *
+ * Append the page to the tail of the write queue under bio_queue_lock,
+ * bump the queue size and wake the queue flusher. Actual submission is
+ * done later by toi_bio_queue_flush_pages().
+ *
+ * On return *@full_buffer is NULL: the queue now owns the page.
+ **/
+static void toi_bio_queue_write(char **full_buffer)
+{
+	struct page *queued = virt_to_page(*full_buffer);
+	unsigned long irq_flags;
+
+	/* page->private is the 'next' link; the new page is the last one. */
+	queued->private = 0;
+
+	spin_lock_irqsave(&bio_queue_lock, irq_flags);
+	if (bio_queue_head)
+		bio_queue_tail->private = (unsigned long) queued;
+	else
+		bio_queue_head = queued;
+
+	bio_queue_tail = queued;
+	atomic_inc(&toi_bio_queue_size);
+
+	spin_unlock_irqrestore(&bio_queue_lock, irq_flags);
+	wake_up(&toi_io_queue_flusher);
+
+	*full_buffer = NULL;
+}
+
+/**
+ * toi_rw_cleanup - Cleanup after i/o.
+ * @writing: Whether we were reading or writing.
+ *
+ * When writing, queue the partially filled buffer (unless the cycle was
+ * aborted), flush the queue and record where the stream ended. Then wait
+ * for all i/o, free any unused readahead pages and, for streams other than
+ * the header, log and reset the wait statistics.
+ *
+ * Returns the first flush error when writing, otherwise the result of
+ * toi_finish_all_io().
+ **/
+static int toi_rw_cleanup(int writing)
+{
+	int reason, result;
+
+	if (writing) {
+		if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED))
+			toi_bio_queue_write(&toi_writer_buffer);
+
+		result = toi_bio_queue_flush_pages(0);
+		if (result)
+			return result;
+
+		switch (current_stream) {
+		case 2:
+			toi_extent_state_save(&toi_writer_posn,
+					&toi_writer_posn_save[1]);
+			break;
+		case 1:
+			toi_extent_state_save(&toi_writer_posn,
+					&toi_writer_posn_save[3]);
+			break;
+		}
+	}
+
+	result = toi_finish_all_io();
+
+	/* Release readahead pages that were never consumed. */
+	while (readahead_list_head) {
+		struct page *following =
+			(struct page *) readahead_list_head->private;
+
+		toi__free_page(12, readahead_list_head);
+		readahead_list_head = following;
+	}
+
+	readahead_list_tail = NULL;
+
+	if (!current_stream)
+		return result;
+
+	for (reason = 0; reason < NUM_REASONS; reason++) {
+		if (atomic_read(&reasons[reason])) {
+			printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n",
+				reason_name[reason],
+				atomic_read(&reasons[reason]));
+			atomic_set(&reasons[reason], 0);
+		}
+	}
+
+	current_stream = 0;
+	return result;
+}
+
+/**
+ * toi_start_one_readahead - start one page of readahead
+ * @dedicated_thread: Is this a thread dedicated to doing readahead?
+ *
+ * Start one new page of readahead. If this is being called by a thread
+ * whose only job is to submit readahead, don't quit because we failed
+ * to allocate a page.
+ *
+ * Returns 0 on success, a non-zero result from throttle_if_needed(),
+ * -ENOMEM if a non-dedicated caller fails to allocate a page twice, or the
+ * error returned by toi_bio_rw_page(). If that error is -ENODATA the page
+ * allocated here is freed again, since it was never queued for i/o.
+ **/
+static int toi_start_one_readahead(int dedicated_thread)
+{
+	struct page *page;
+	char *buffer = NULL;
+	int oom = 0, result;
+
+	result = throttle_if_needed(dedicated_thread ? THROTTLE_WAIT : 0);
+	if (result)
+		return result;
+
+	mutex_lock(&toi_bio_readahead_mutex);
+
+	while (!buffer) {
+		buffer = (char *) toi_get_zeroed_page(12,
+				TOI_ATOMIC_GFP);
+		if (!buffer) {
+			/* Non-dedicated callers give up on the second failure. */
+			if (oom && !dedicated_thread) {
+				mutex_unlock(&toi_bio_readahead_mutex);
+				return -ENOMEM;
+			}
+
+			oom = 1;
+			set_free_mem_throttle();
+			/* Reason 5: "readahead buffer allocation" */
+			do_bio_wait(5);
+		}
+	}
+
+	page = virt_to_page(buffer);
+	result = toi_bio_rw_page(READ, page, 1, 0);
+
+	/*
+	 * -ENODATA comes from go_next_page(), i.e. before toi_do_io() linked
+	 * the page into the readahead list, so nothing else would free it.
+	 */
+	if (result == -ENODATA)
+		toi__free_page(12, page);
+
+	mutex_unlock(&toi_bio_readahead_mutex);
+	return result;
+}
+
+/**
+ * toi_start_new_readahead - start new readahead
+ * @dedicated_thread: Are we dedicated to this task?
+ *
+ * Start readahead of image pages.
+ *
+ * We can be called as a thread dedicated to this task (may be helpful on
+ * systems with lots of CPUs), in which case we don't exit until there's no
+ * more readahead.
+ *
+ * If this is not called by a dedicated thread, we top up our queue until
+ * there's no more readahead to submit, we've submitted the number given
+ * in target_outstanding_io or the number in progress exceeds the target
+ * outstanding I/O value.
+ *
+ * No mutex needed because this is only ever called by the first cpu.
+ *
+ * Returns 0 if there is no more readahead to start or if the last attempt
+ * failed with -ENOMEM or -ENODATA; otherwise the result of the last
+ * toi_start_one_readahead() call.
+ **/
+static int toi_start_new_readahead(int dedicated_thread)
+{
+ int last_result, num_submitted = 0;
+
+ /* Start a new readahead? */
+ if (!more_readahead)
+ return 0;
+
+ do {
+ last_result = toi_start_one_readahead(dedicated_thread);
+
+ if (last_result) {
+ /* Out of memory or out of data is not reported as an error. */
+ if (last_result == -ENOMEM || last_result == -ENODATA)
+ return 0;
+
+ printk(KERN_DEBUG
+ "Begin read chunk returned %d.\n",
+ last_result);
+ } else
+ num_submitted++;
+
+ } while (more_readahead && !last_result &&
+ (dedicated_thread ||
+ (num_submitted < target_outstanding_io &&
+ atomic_read(&toi_io_in_progress) < target_outstanding_io)));
+
+ return last_result;
+}
+
+/**
+ * bio_io_flusher - run the dedicated I/O thread routine
+ * @writing: Whether we're writing the image.
+ *
+ * When writing, act as the dedicated write queue flusher; when reading,
+ * act as the dedicated readahead submitter. Returns that routine's result.
+ **/
+static int bio_io_flusher(int writing)
+{
+	return writing ? toi_bio_queue_flush_pages(1) :
+			 toi_start_new_readahead(1);
+}
+
+/**
+ * toi_bio_get_next_page_read - read a disk page, perhaps with readahead
+ * @no_readahead: If set, submit a single read here before looking at the
+ * readahead list.
+ *
+ * Read a page from disk, submitting readahead and cleaning up finished i/o
+ * while we wait for the page we're after. The data is copied from the page
+ * at the head of the readahead list into toi_writer_buffer, and that page
+ * is then freed.
+ *
+ * Returns 0 on success or -EIO if a read could not be started.
+ **/
+static int toi_bio_get_next_page_read(int no_readahead)
+{
+ unsigned long *virt;
+ struct page *next;
+
+ /*
+ * When reading the second page of the header, we have to
+ * delay submitting the read until after we've gotten the
+ * extents out of the first page.
+ */
+ if (unlikely(no_readahead && toi_start_one_readahead(0))) {
+ printk(KERN_DEBUG "No readahead and toi_start_one_readahead "
+ "returned non-zero.\n");
+ return -EIO;
+ }
+
+ /* Nothing queued: start a read now; there must be data left to read. */
+ if (unlikely(!readahead_list_head)) {
+ BUG_ON(!more_readahead);
+ if (unlikely(toi_start_one_readahead(0))) {
+ printk(KERN_DEBUG "No readahead and "
+ "toi_start_one_readahead returned non-zero.\n");
+ return -EIO;
+ }
+ }
+
+ /* The page is locked by toi_do_io() and unlocked by toi_end_bio(). */
+ if (PageLocked(readahead_list_head)) {
+ waiting_on = readahead_list_head;
+ /* Reason 0: "readahead not ready" */
+ do_bio_wait(0);
+ }
+
+ virt = page_address(readahead_list_head);
+ memcpy(toi_writer_buffer, virt, PAGE_SIZE);
+
+ /* Pop the head; readahead_list_tail is left untouched here. */
+ next = (struct page *) readahead_list_head->private;
+ toi__free_page(12, readahead_list_head);
+ readahead_list_head = next;
+ return 0;
+}
+
+/**
+ * toi_bio_queue_flush_pages - flush the queue of pages queued for writing
+ * @dedicated_thread: Whether we're a dedicated thread
+ *
+ * Flush the queue of pages ready to be written to disk.
+ *
+ * If we're a dedicated thread, stay in here until told to leave,
+ * sleeping in wait_event.
+ *
+ * The first thread is normally the only one to come in here. Another
+ * thread can enter this routine too, though, via throttle_if_needed.
+ * Since that's the case, we must be careful to only have one thread
+ * doing this work at a time. Otherwise we have a race and could save
+ * pages out of order. The flusher role is therefore claimed with an
+ * atomic exchange; a thread that finds it already taken returns 0
+ * without touching the queue.
+ *
+ * If an error occurs, free all remaining pages without submitting them
+ * for I/O.
+ *
+ * Returns 0 or the first error from toi_bio_rw_page().
+ **/
+
+static int toi_bio_queue_flush_pages(int dedicated_thread)
+{
+	static atomic_t busy = ATOMIC_INIT(0);
+	unsigned long flags;
+	int result = 0;
+
+	/*
+	 * Test and set in one atomic step: with a plain "if (busy) ...;
+	 * busy = 1;" two threads could both see zero and flush at once.
+	 */
+	if (atomic_xchg(&busy, 1))
+		return 0;
+
+top:
+	spin_lock_irqsave(&bio_queue_lock, flags);
+	while (bio_queue_head) {
+		struct page *page = bio_queue_head;
+		bio_queue_head = (struct page *) page->private;
+		if (bio_queue_tail == page)
+			bio_queue_tail = NULL;
+		atomic_dec(&toi_bio_queue_size);
+		/* Drop the lock while submitting; that is the slow part. */
+		spin_unlock_irqrestore(&bio_queue_lock, flags);
+		if (!result)
+			result = toi_bio_rw_page(WRITE, page, 0, 11);
+		/* After an error, remaining pages are freed, not written. */
+		if (result)
+			toi__free_page(11 , page);
+		spin_lock_irqsave(&bio_queue_lock, flags);
+	}
+	spin_unlock_irqrestore(&bio_queue_lock, flags);
+
+	if (dedicated_thread) {
+		wait_event(toi_io_queue_flusher, bio_queue_head ||
+				toi_bio_queue_flusher_should_finish);
+		if (likely(!toi_bio_queue_flusher_should_finish))
+			goto top;
+		toi_bio_queue_flusher_should_finish = 0;
+	}
+
+	/* Make our work visible before letting another thread take over. */
+	smp_mb();
+	atomic_set(&busy, 0);
+	return result;
+}
+
+/**
+ * toi_bio_get_new_page - get a new page for I/O
+ * @full_buffer: Where to store the address of the new page.
+ *
+ * Throttle first, then keep trying to allocate a zeroed page until one is
+ * obtained, raising the free memory throttle and waiting for i/o after
+ * each failure. Nothing is allocated if *@full_buffer is already set.
+ *
+ * Returns 0, or the non-zero result of throttle_if_needed().
+ **/
+static int toi_bio_get_new_page(char **full_buffer)
+{
+	int result;
+
+	result = throttle_if_needed(THROTTLE_WAIT);
+	if (result)
+		return result;
+
+	while (!*full_buffer) {
+		*full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP);
+		if (*full_buffer)
+			break;
+
+		set_free_mem_throttle();
+		do_bio_wait(3);
+	}
+
+	return 0;
+}
+
+/**
+ * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O
+ * @writing: Bool - whether writing (or reading).
+ * @buffer: The start of the buffer to write or fill.
+ * @buffer_size: The size of the buffer to write or fill.
+ * @no_readahead: Don't try to start readhead (when getting extents).
+ *
+ * Data is copied between @buffer and toi_writer_buffer. Each time the page
+ * sized buffer is exhausted it is either queued for writing and replaced
+ * with a new page, or refilled with the next page read from the image.
+ *
+ * Returns 0 on success, or the error from fetching the next page (reading)
+ * or from getting a new page (writing).
+ **/
+static int toi_rw_buffer(int writing, char *buffer, int buffer_size,
+ int no_readahead)
+{
+ int bytes_left = buffer_size, result = 0;
+
+ while (bytes_left) {
+ char *source_start = buffer + buffer_size - bytes_left;
+ char *dest_start = toi_writer_buffer + toi_writer_buffer_posn;
+ int capacity = PAGE_SIZE - toi_writer_buffer_posn;
+ /* The copy direction depends on whether we read or write. */
+ char *to = writing ? dest_start : source_start;
+ char *from = writing ? source_start : dest_start;
+
+ /* Everything left fits in (or comes from) the current page. */
+ if (bytes_left <= capacity) {
+ memcpy(to, from, bytes_left);
+ toi_writer_buffer_posn += bytes_left;
+ return 0;
+ }
+
+ /* Complete this page and start a new one */
+ memcpy(to, from, capacity);
+ bytes_left -= capacity;
+
+ if (!writing) {
+ /*
+ * Perform actual I/O:
+ * read readahead_list_head into toi_writer_buffer
+ */
+ /* Note: this declaration shadows the outer 'result'. */
+ int result = toi_bio_get_next_page_read(no_readahead);
+ if (result)
+ return result;
+ } else {
+ /* Queueing sets toi_writer_buffer to NULL; get a new page. */
+ toi_bio_queue_write(&toi_writer_buffer);
+ result = toi_bio_get_new_page(&toi_writer_buffer);
+ if (result)
+ return result;
+ }
+
+ toi_writer_buffer_posn = 0;
+ toi_cond_pause(0, NULL);
+ }
+
+ return 0;
+}
+
+/**
+ * toi_bio_read_page - read a page of the image
+ * @pfn: The pfn where the data belongs.
+ * @buffer_page: The page containing the (possibly compressed) data.
+ * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE).
+ *
+ * Read a (possibly compressed) page from the image, into buffer_page,
+ * returning its pfn and the buffer size.
+ *
+ * Returns 0 on success, -EIO if the queue flusher failed to start
+ * readahead, or 1 if reading the data failed (in which case the
+ * hibernation cycle is also aborted).
+ **/
+static int toi_bio_read_page(unsigned long *pfn, struct page *buffer_page,
+ unsigned int *buf_size)
+{
+ int result = 0;
+ char *buffer_virt = kmap(buffer_page);
+
+ /*
+ * Only call start_new_readahead if we don't have a dedicated thread
+ * and we're the queue flusher.
+ */
+ if (current == toi_queue_flusher) {
+ int result2 = toi_start_new_readahead(0);
+ if (result2) {
+ printk(KERN_DEBUG "Queue flusher and "
+ "toi_start_one_readahead returned non-zero.\n");
+ result = -EIO;
+ goto out;
+ }
+ }
+
+ my_mutex_lock(0, &toi_bio_mutex);
+
+ /*
+ * Structure in the image:
+ * [destination pfn|page size|page data]
+ * buf_size is PAGE_SIZE
+ */
+ /*
+ * NOTE(review): *buf_size comes from the image and is used as the
+ * length of the third read without a visible bound check - confirm
+ * that it cannot exceed PAGE_SIZE.
+ */
+ if (toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) ||
+ toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) ||
+ toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) {
+ abort_hibernate(TOI_FAILED_IO, "Read of data failed.");
+ result = 1;
+ }
+
+ my_mutex_unlock(0, &toi_bio_mutex);
+out:
+ kunmap(buffer_page);
+ return result;
+}
+
+/**
+ * toi_bio_write_page - write a page of the image
+ * @pfn: The pfn where the data belongs.
+ * @buffer_page: The page containing the (possibly compressed) data.
+ * @buf_size: The number of bytes on @buffer_page used.
+ *
+ * Write a (possibly compressed) page to the image from the buffer, together
+ * with its index and buffer size.
+ *
+ * Returns 0 on success (or when TOI_TEST_FILTER_SPEED is set, in which case
+ * nothing is written), -EIO if the cycle has been aborted or buffering the
+ * data failed, or the result of flushing the queue when the caller is the
+ * queue flusher.
+ **/
+static int toi_bio_write_page(unsigned long pfn, struct page *buffer_page,
+ unsigned int buf_size)
+{
+ char *buffer_virt;
+ int result = 0, result2 = 0;
+
+ if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED)))
+ return 0;
+
+ my_mutex_lock(1, &toi_bio_mutex);
+
+ if (test_result_state(TOI_ABORTED)) {
+ my_mutex_unlock(1, &toi_bio_mutex);
+ return -EIO;
+ }
+
+ buffer_virt = kmap(buffer_page);
+
+ /*
+ * Structure in the image:
+ * [destination pfn|page size|page data]
+ * buf_size is PAGE_SIZE
+ */
+ if (toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) ||
+ toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) ||
+ toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) {
+ printk(KERN_DEBUG "toi_rw_buffer returned non-zero to "
+ "toi_bio_write_page.\n");
+ result = -EIO;
+ }
+
+ kunmap(buffer_page);
+ my_mutex_unlock(1, &toi_bio_mutex);
+
+ /* The flusher submits queued pages only after dropping the mutex. */
+ if (current == toi_queue_flusher)
+ result2 = toi_bio_queue_flush_pages(0);
+
+ /* An error while buffering takes precedence over a flush error. */
+ return result ? result : result2;
+}
+
+/**
+ * _toi_rw_header_chunk - read or write a portion of the image header
+ * @writing: Whether reading or writing.
+ * @owner: The module for which we're writing.
+ * Used for confirming that modules
+ * don't use more header space than they asked for.
+ * @buffer: Address of the data to write.
+ * @buffer_size: Size of the data buffer.
+ * @no_readahead: Don't try to start readhead (when getting extents).
+ *
+ * Perform PAGE_SIZE I/O. Start readahead if needed.
+ *
+ * Returns 0 on success, an error from starting readahead or from
+ * toi_rw_buffer(), or @buffer_size (without doing any I/O) if @owner has
+ * now used more header space than it requested.
+ **/
+static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
+ char *buffer, int buffer_size, int no_readahead)
+{
+ int result = 0;
+
+ if (owner) {
+ /* Account the space to the module and check it against its request. */
+ owner->header_used += buffer_size;
+ toi_message(TOI_HEADER, TOI_LOW, 1,
+ "Header: %s : %d bytes (%d/%d).\n",
+ owner->name,
+ buffer_size, owner->header_used,
+ owner->header_requested);
+ if (owner->header_used > owner->header_requested) {
+ printk(KERN_EMERG "TuxOnIce module %s is using more "
+ "header space (%u) than it requested (%u).\n",
+ owner->name,
+ owner->header_used,
+ owner->header_requested);
+ return buffer_size;
+ }
+ } else {
+ unowned += buffer_size;
+ toi_message(TOI_HEADER, TOI_LOW, 1,
+ "Header: (No owner): %d bytes (%d total so far)\n",
+ buffer_size, unowned);
+ }
+
+ if (!writing && !no_readahead)
+ result = toi_start_new_readahead(0);
+
+ if (!result)
+ result = toi_rw_buffer(writing, buffer, buffer_size,
+ no_readahead);
+
+ /* Counted even if the I/O above failed. */
+ total_header_bytes += buffer_size;
+ return result;
+}
+
+static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner,
+ char *buffer, int size)
+{
+ return _toi_rw_header_chunk(writing, owner, buffer, size, 0);
+}
+
+static int toi_rw_header_chunk_noreadahead(int writing,
+ struct toi_module_ops *owner, char *buffer, int size)
+{
+ return _toi_rw_header_chunk(writing, owner, buffer, size, 1);
+}
+
+/**
+ * write_header_chunk_finish - flush any buffered header data
+ **/
+static int write_header_chunk_finish(void)
+{
+ int result = 0;
+
+ if (toi_writer_buffer_posn)
+ toi_bio_queue_write(&toi_writer_buffer);
+
+ result = toi_finish_all_io();
+
+ unowned = 0; /* reset the header byte accounting */
+ total_header_bytes = 0;
+ return result; /* from toi_finish_all_io() */
+}
+
+/**
+ * toi_bio_storage_needed - get the amount of storage needed for my fns
+ **/
+static int toi_bio_storage_needed(void)
+{
+ return sizeof(int); /* matches toi_bio_save_config_info() */
+}
+
+/**
+ * toi_bio_save_config_info - save block I/O config to image header
+ * @buf: PAGE_SIZE'd buffer into which data should be saved.
+ **/
+static int toi_bio_save_config_info(char *buf)
+{
+ int *ints = (int *) buf;
+ ints[0] = target_outstanding_io;
+ return sizeof(int); /* number of bytes stored in @buf */
+}
+
+/**
+ * toi_bio_load_config_info - restore block I/O config
+ * @buf: Data to be reloaded.
+ * @size: Size of the buffer saved (not checked here).
+ **/
+static void toi_bio_load_config_info(char *buf, int size)
+{
+ int *ints = (int *) buf;
+ target_outstanding_io = ints[0];
+}
+
+/**
+ * toi_bio_initialise - initialise bio code at start of some action
+ * @starting_cycle: Whether starting a hibernation cycle, or just reading or
+ * writing a sysfs value.
+ **/
+static int toi_bio_initialise(int starting_cycle)
+{
+ if (starting_cycle) {
+ max_outstanding_writes = 0;
+ max_outstanding_reads = 0;
+ toi_queue_flusher = current; /* this task flushes queued pages */
+#ifdef MEASURE_MUTEX_CONTENTION
+ {
+ int i, j, k;
+
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < 2; j++)
+ for_each_online_cpu(k)
+ mutex_times[i][j][k] = 0;
+ }
+#endif
+ }
+
+ return 0; /* always succeeds */
+}
+
+/**
+ * toi_bio_cleanup - cleanup after some action
+ * @finishing_cycle: Whether completing a cycle (unused here).
+ **/
+static void toi_bio_cleanup(int finishing_cycle)
+{
+ if (toi_writer_buffer) {
+ toi_free_page(11, (unsigned long) toi_writer_buffer);
+ toi_writer_buffer = NULL; /* so a repeat call is a no-op */
+ }
+}
+
+struct toi_bio_ops toi_bio_ops = { /* check/reset_io_stats stay NULL */
+ .bdev_page_io = toi_bdev_page_io,
+ .finish_all_io = toi_finish_all_io,
+ .update_throughput_throttle = update_throughput_throttle,
+ .forward_one_page = go_next_page,
+ .set_extra_page_forward = set_extra_page_forward,
+ .set_devinfo = toi_set_devinfo,
+ .read_page = toi_bio_read_page,
+ .write_page = toi_bio_write_page,
+ .rw_init = toi_rw_init,
+ .rw_cleanup = toi_rw_cleanup,
+ .read_header_init = toi_read_header_init,
+ .rw_header_chunk = toi_rw_header_chunk,
+ .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead,
+ .write_header_chunk_finish = write_header_chunk_finish,
+ .io_flusher = bio_io_flusher,
+};
+
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io,
+ 0, 16384, 0, NULL), /* min 0, max 16384 */
+};
+
+static struct toi_module_ops toi_blockwriter_ops = {
+ .name = "lowlevel i/o",
+ .type = MISC_HIDDEN_MODULE,
+ .directory = "block_io",
+ .module = THIS_MODULE,
+ .print_debug_info = toi_bio_print_debug_stats,
+ .memory_needed = toi_bio_memory_needed,
+ .storage_needed = toi_bio_storage_needed,
+ .save_config_info = toi_bio_save_config_info,
+ .load_config_info = toi_bio_load_config_info,
+ .initialise = toi_bio_initialise,
+ .cleanup = toi_bio_cleanup,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/**
+ * toi_block_io_load - load time routine for block I/O module
+ *
+ * Register block i/o ops and sysfs entries.
+ **/
+static __init int toi_block_io_load(void)
+{
+ return toi_register_module(&toi_blockwriter_ops);
+}
+
+late_initcall(toi_block_io_load);
diff --git a/kernel/power/tuxonice_block_io.h b/kernel/power/tuxonice_block_io.h
new file mode 100644
index 0000000..b18298c
--- /dev/null
+++ b/kernel/power/tuxonice_block_io.h
@@ -0,0 +1,59 @@
+/*
+ * kernel/power/tuxonice_block_io.h
+ *
+ * Copyright (C) 2004-2008 Nigel Cunningham (nigel at tuxonice net)
+ * Copyright (C) 2006 Red Hat, inc.
+ *
+ * Distributed under GPLv2.
+ *
+ * This file contains declarations for functions exported from
+ * tuxonice_block_io.c, which contains low level io functions.
+ */
+
+#include <linux/buffer_head.h>
+#include "tuxonice_extent.h"
+
+struct toi_bdev_info {
+ struct block_device *bdev;
+ dev_t dev_t;
+ int bmap_shift;
+ int blocks_per_page;
+ int ignored;
+};
+
+/*
+ * Our exported interface so the swapwriter and filewriter don't
+ * need these functions duplicated.
+ */
+struct toi_bio_ops {
+ int (*bdev_page_io) (int rw, struct block_device *bdev, long pos,
+ struct page *page);
+ void (*check_io_stats) (void);
+ void (*reset_io_stats) (void);
+ void (*update_throughput_throttle) (int jif_index);
+ int (*finish_all_io) (void);
+ int (*forward_one_page) (int writing, int section_barrier);
+ void (*set_extra_page_forward) (void);
+ void (*set_devinfo) (struct toi_bdev_info *info);
+ int (*read_page) (unsigned long *index, struct page *buffer_page,
+ unsigned int *buf_size);
+ int (*write_page) (unsigned long index, struct page *buffer_page,
+ unsigned int buf_size);
+ void (*read_header_init) (void);
+ int (*rw_header_chunk) (int rw, struct toi_module_ops *owner,
+ char *buffer, int buffer_size);
+ int (*rw_header_chunk_noreadahead) (int rw,
+ struct toi_module_ops *owner,
+ char *buffer, int buffer_size);
+ int (*write_header_chunk_finish) (void);
+ int (*rw_init) (int rw, int stream_number);
+ int (*rw_cleanup) (int rw);
+ int (*io_flusher) (int rw);
+};
+
+extern struct toi_bio_ops toi_bio_ops;
+
+extern char *toi_writer_buffer;
+extern int toi_writer_buffer_posn;
+extern struct hibernate_extent_iterate_saved_state toi_writer_posn_save[4];
+extern struct toi_extent_iterate_state toi_writer_posn;
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
Documentation/power/tuxonice-internals.txt | 477 ++++++++++++++
Documentation/power/tuxonice.txt | 924 ++++++++++++++++++++++++++++
MAINTAINERS | 7 +
3 files changed, 1408 insertions(+), 0 deletions(-)
create mode 100644 Documentation/power/tuxonice-internals.txt
create mode 100644 Documentation/power/tuxonice.txt
diff --git a/Documentation/power/tuxonice-internals.txt b/Documentation/power/tuxonice-internals.txt
new file mode 100644
index 0000000..7a96186
--- /dev/null
+++ b/Documentation/power/tuxonice-internals.txt
@@ -0,0 +1,477 @@
+ TuxOnIce 3.0 Internal Documentation.
+ Updated to 26 March 2009
+
+1. Introduction.
+
+ TuxOnIce 3.0 is an addition to the Linux Kernel, designed to
+ allow the user to quickly shutdown and quickly boot a computer, without
+ needing to close documents or programs. It is equivalent to the
+ hibernate facility in some laptops. This implementation, however,
+ requires no special BIOS or hardware support.
+
+ The code in these files is based upon the original implementation
+ prepared by Gabor Kuti and additional work by Pavel Machek and a
+ host of others. This code has been substantially reworked by Nigel
+ Cunningham, again with the help and testing of many others, not the
+ least of whom is Michael Frank. At its heart, however, the operation is
+ essentially the same as Gabor's version.
+
+2. Overview of operation.
+
+ The basic sequence of operations is as follows:
+
+ a. Quiesce all other activity.
+ b. Ensure enough memory and storage space are available, and attempt
+ to free memory/storage if necessary.
+ c. Allocate the required memory and storage space.
+ d. Write the image.
+ e. Power down.
+
+ There are a number of complicating factors which mean that things are
+ not as simple as the above would imply, however...
+
+ o The activity of each process must be stopped at a point where it will
+ not be holding locks necessary for saving the image, or unexpectedly
+ restart operations due to something like a timeout and thereby make
+ our image inconsistent.
+
+ o It is desirable that we sync outstanding I/O to disk before calculating
+ image statistics. This reduces corruption if one should suspend but
+ then not resume, and also makes later parts of the operation safer (see
+ below).
+
+ o We need to get as close as we can to an atomic copy of the data.
+ Inconsistencies in the image will result in inconsistent memory contents at
+ resume time, and thus in instability of the system and/or file system
+ corruption. This would appear to imply a maximum image size of one half of
+ the amount of RAM, but we have a solution... (again, below).
+
+ o In 2.6, we choose to play nicely with the other suspend-to-disk
+ implementations.
+
+3. Detailed description of internals.
+
+ a. Quiescing activity.
+
+ Safely quiescing the system is achieved using three separate but related
+ aspects.
+
+ First, we note that the vast majority of processes don't need to run during
+ suspend. They can be 'frozen'. We therefore implement a refrigerator
+ routine, which processes enter and in which they remain until the cycle is
+ complete. Processes enter the refrigerator via try_to_freeze() invocations
+ at appropriate places. A process cannot be frozen in any old place. It
+ must not be holding locks that will be needed for writing the image or
+ freezing other processes. For this reason, userspace processes generally
+ enter the refrigerator via the signal handling code, and kernel threads at
+ the place in their event loops where they drop locks and yield to other
+ processes or sleep.
+
+ The task of freezing processes is complicated by the fact that there can be
+ interdependencies between processes. Freezing process A before process B may
+ mean that process B cannot be frozen, because it stops while waiting for
+ process A rather than in the refrigerator. This issue is seen where
+ userspace waits on freezeable kernel threads or fuse filesystem threads. To
+ address this issue, we implement the following algorithm for quiescing
+ activity:
+
+ - Freeze filesystems (including fuse - userspace programs starting
+ new requests are immediately frozen; programs already running
+ requests complete their work before being frozen in the next
+ step)
+ - Freeze userspace
+ - Thaw filesystems (this is safe now that userspace is frozen and no
+ fuse requests are outstanding).
+ - Invoke sys_sync (noop on fuse).
+ - Freeze filesystems
+ - Freeze kernel threads
+
+ If we need to free memory, we thaw kernel threads and filesystems, but not
+ userspace. We can then free caches without worrying about deadlocks due to
+ swap files being on frozen filesystems or such like.
+
+ b. Ensure enough memory & storage are available.
+
+ We have a number of constraints to meet in order to be able to successfully
+ suspend and resume.
+
+ First, the image will be written in two parts, described below. One of these
+ parts needs to have an atomic copy made, which of course implies a maximum
+ size of one half of the amount of system memory. The other part ('pageset')
+ is not atomically copied, and can therefore be as large or small as desired.
+
+ Second, we have constraints on the amount of storage available. In these
+ calculations, we may also consider any compression that will be done. The
+ cryptoapi module allows the user to configure an expected compression ratio.
+
+ Third, the user can specify an arbitrary limit on the image size, in
+ megabytes. This limit is treated as a soft limit, so that we don't fail the
+ attempt to suspend if we cannot meet this constraint.
+
+ c. Allocate the required memory and storage space.
+
+ Having done the initial freeze, we determine whether the above constraints
+ are met, and seek to allocate the metadata for the image. If the constraints
+ are not met, or we fail to allocate the required space for the metadata, we
+ seek to free the amount of memory that we calculate is needed and try again.
+ We allow up to four iterations of this loop before aborting the cycle. If we
+ do fail, it should only be because of a bug in TuxOnIce's calculations.
+
+ These steps are merged together in the prepare_image function, found in
+ prepare_image.c. The functions are merged because of the cyclical nature
+ of the problem of calculating how much memory and storage is needed. Since
+ the data structures containing the information about the image must
+ themselves take memory and use storage, the amount of memory and storage
+ required changes as we prepare the image. Since the changes are not large,
+ only one or two iterations will be required to achieve a solution.
+
+ The recursive nature of the algorithm is minimised by keeping user space
+ frozen while preparing the image, and by the fact that our records of which
+ pages are to be saved and which pageset they are saved in use bitmaps (so
+ that changes in number or fragmentation of the pages to be saved don't
+ feedback via changes in the amount of memory needed for metadata). The
+ recursiveness is thus limited to any extra slab pages allocated to store the
+ extents that record storage used, and the effects of seeking to free memory.
+
+ d. Write the image.
+
+ We previously mentioned the need to create an atomic copy of the data, and
+ the half-of-memory limitation that is implied in this. This limitation is
+ circumvented by dividing the memory to be saved into two parts, called
+ pagesets.
+
+ Pageset2 contains most of the page cache - the pages on the active and
+ inactive LRU lists that aren't needed or modified while TuxOnIce is
+ running, so they can be safely written without an atomic copy. They are
+ therefore saved first and reloaded last. While saving these pages,
+ TuxOnIce carefully ensures that the work of writing the pages doesn't make
+ the image inconsistent. With the support for Kernel (Video) Mode Setting
+ going into the kernel at the time of writing, we need to check for pages
+ on the LRU that are used by KMS, and exclude them from pageset2. They are
+ atomically copied as part of pageset 1.
+
+ Once pageset2 has been saved, we prepare to do the atomic copy of remaining
+ memory. As part of the preparation, we power down drivers, thereby providing
+ them with the opportunity to have their state recorded in the image. The
+ amount of memory allocated by drivers for this is usually negligible, but if
+ DRI is in use, video drivers may require significant amounts. Ideally we
+ would be able to query drivers while preparing the image as to the amount of
+ memory they will need. Unfortunately no such mechanism exists at the time of
+ writing. For this reason, TuxOnIce allows the user to set an
+ 'extra_pages_allowance', which is used to seek to ensure sufficient memory
+ is available for drivers at this point. TuxOnIce also lets the user set this
+ value to 0. In this case, a test driver suspend is done while preparing the
+ image, and the difference (plus a margin) used instead. TuxOnIce will also
+ automatically restart the hibernation process (twice at most) if it finds
+ that the extra pages allowance is not sufficient. It will then use what was
+ actually needed (plus a margin, again). Failure to hibernate should thus
+ be an extremely rare occurrence.
+
+ Having suspended the drivers, we save the CPU context before making an
+ atomic copy of pageset1, resuming the drivers and saving the atomic copy.
+ After saving the two pagesets, we just need to save our metadata before
+ powering down.
+
+ As we mentioned earlier, the contents of pageset2 pages aren't needed once
+ they've been saved. We therefore use them as the destination of our atomic
+ copy. In the unlikely event that pageset1 is larger, extra pages are
+ allocated while the image is being prepared. This is normally only a real
+ possibility when the system has just been booted and the page cache is
+ small.
+
+ This is where we need to be careful about syncing, however. Pageset2 will
+ probably contain filesystem meta data. If this is overwritten with pageset1
+ and then a sync occurs, the filesystem will be corrupted - at least until
+ resume time and another sync of the restored data. Since there is a
+ possibility that the user might not resume or (may it never be!) that
+ TuxOnIce might oops, we do our utmost to avoid syncing filesystems after
+ copying pageset1.
+
+ e. Power down.
+
+ Powering down uses standard kernel routines. TuxOnIce supports powering down
+ using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off.
+ Supporting suspend to ram (S3) as a power off option might sound strange,
+ but it allows the user to quickly get their system up and running again if
+ the battery doesn't run out (we just need to re-read the overwritten pages)
+ and if the battery does run out (or the user removes power), they can still
+ resume.
+
+4. Data Structures.
+
+ TuxOnIce uses three main structures to store its metadata and configuration
+ information:
+
+ a) Pageflags bitmaps.
+
+ TuxOnIce records which pages will be in pageset1, pageset2, the destination
+ of the atomic copy and the source of the atomically restored image using
+ bitmaps. The code used is that written for swsusp, with small improvements
+ to match TuxOnIce's requirements.
+
+ The pageset1 bitmap is thus easily stored in the image header for use at
+ resume time.
+
+ As mentioned above, using bitmaps also means that the amount of memory and
+ storage required for recording the above information is constant. This
+ greatly simplifies the work of preparing the image. In earlier versions of
+ TuxOnIce, extents were used to record which pages would be stored. In that
+ case, however, eating memory could result in greater fragmentation of the
+ lists of pages, which in turn required more memory to store the extents and
+ more storage in the image header. These could in turn require further
+ freeing of memory, and another iteration. All of this complexity is removed
+ by having bitmaps.
+
+ Bitmaps also make a lot of sense because TuxOnIce only ever iterates
+ through the lists. There is therefore no cost to not being able to find the
+ nth page in order 0 time. We only need to worry about the cost of finding
+ the n+1th page, given the location of the nth page. Bitwise optimisations
+ help here.
+
+ b) Extents for block data.
+
+ TuxOnIce supports writing the image to multiple block devices. In the case
+ of swap, multiple partitions and/or files may be in use, and we happily use
+ them all (with the exception of compcache pages, which we allocate but do
+ not use). This use of multiple block devices is accomplished as follows:
+
+ Whatever the actual source of the allocated storage, the destination of the
+ image can be viewed in terms of one or more block devices, and on each
+ device, a list of sectors. To simplify matters, we only use contiguous,
+ PAGE_SIZE aligned sectors, like the swap code does.
+
+ Since sector numbers on each bdev may well not start at 0, it makes much
+ more sense to use extents here. Contiguous ranges of pages can thus be
+ represented in the extents by contiguous values.
+
+ Variations in block size are taken account of in transforming this data
+ into the parameters for bio submission.
+
+ We can thus implement a layer of abstraction wherein the core of TuxOnIce
+ doesn't have to worry about which device we're currently writing to or
+ where in the device we are. It simply requests that the next page in the
+ pageset or header be written, leaving the details to this lower layer.
+ The lower layer remembers where in the sequence of devices and blocks each
+ pageset starts. The header always starts at the beginning of the allocated
+ storage.
+
+ So extents are:
+
+ struct extent {
+ unsigned long minimum, maximum;
+ struct extent *next;
+ }
+
+ These are combined into chains of extents for a device:
+
+ struct extent_chain {
+ int size; /* size of the extent ie sum (max-min+1) */
+ int allocs, frees;
+ char *name;
+ struct extent *first, *last_touched;
+ };
+
+ For each bdev, we need to store a little more info:
+
+ struct suspend_bdev_info {
+ struct block_device *bdev;
+ dev_t dev_t;
+ int bmap_shift;
+ int blocks_per_page;
+ };
+
+ The dev_t is used to identify the device in the stored image. As a result,
+ we expect devices at resume time to have the same major and minor numbers
+ as they had while suspending. This is primarily a concern where the user
+ utilises LVM for storage, as they will need to dmsetup their partitions in
+ such a way as to maintain this consistency at resume time.
+
+ bmap_shift and blocks_per_page apply the effects of variations in blocks
+ per page settings for the filesystem and underlying bdev. For most
+ filesystems, these are the same, but for xfs, they can have independent
+ values.
+
+ Combining these two structures together, we have everything we need to
+ record what devices and what blocks on each device are being used to
+ store the image, and to submit i/o using submit_bio().
+
+ The last elements in the picture are a means of recording how the storage
+ is being used.
+
+ We do this first and foremost by implementing a layer of abstraction on
+ top of the devices and extent chains which allows us to view however many
+ devices there might be as one long storage tape, with a single 'head' that
+ tracks a 'current position' on the tape:
+
+ struct extent_iterate_state {
+ struct extent_chain *chains;
+ int num_chains;
+ int current_chain;
+ struct extent *current_extent;
+ unsigned long current_offset;
+ };
+
+ That is, *chains points to an array of size num_chains of extent chains.
+ For the filewriter, this is always a single chain. For the swapwriter, the
+ array is of size MAX_SWAPFILES.
+
+ current_chain, current_extent and current_offset thus point to the current
+ index in the chains array (and into a matching array of struct
+ suspend_bdev_info), the current extent in that chain (to optimise access),
+ and the current value in the offset.
+
+ The image is divided into three parts:
+ - The header
+ - Pageset 1
+ - Pageset 2
+
+ The header always starts at the first device and first block. We know its
+ size before we begin to save the image because we carefully account for
+ everything that will be stored in it.
+
+ The second pageset (LRU) is stored first. It begins on the next page after
+ the end of the header.
+
+ The first pageset is stored second. Its start location is only known once
+ pageset2 has been saved, since pageset2 may be compressed as it is written.
+ This location is thus recorded at the end of saving pageset2. It is page
+ aligned also.
+
+ Since this information is needed at resume time, and the location of extents
+ in memory will differ at resume time, this needs to be stored in a portable
+ way:
+
+ struct extent_iterate_saved_state {
+ int chain_num;
+ int extent_num;
+ unsigned long offset;
+ };
+
+ We can thus implement a layer of abstraction wherein the core of TuxOnIce
+ doesn't have to worry about which device we're currently writing to or
+ where in the device we are. It simply requests that the next page in the
+ pageset or header be written, leaving the details to this layer, and
+ invokes the routines to remember and restore the position, without having
+ to worry about the details of how the data is arranged on disk or such like.
+
+ c) Modules
+
+ One aim in designing TuxOnIce was to make it flexible. We wanted to allow
+ for the implementation of different methods of transforming a page to be
+ written to disk and different methods of getting the pages stored.
+
+ In early versions (the betas and perhaps Suspend1), compression support was
+ inlined in the image writing code, and the data structures and code for
+ managing swap were intertwined with the rest of the code. A number of people
+ had expressed interest in implementing image encryption, and alternative
+ methods of storing the image.
+
+ In order to achieve this, TuxOnIce was given a modular design.
+
+ A module is a single file which encapsulates the functionality needed
+ to transform a pageset of data (encryption or compression, for example),
+ or to write the pageset to a device. The former type of module is called
+ a 'page-transformer', the latter a 'writer'.
+
+ Modules are linked together in pipeline fashion. There may be zero or more
+ page transformers in a pipeline, and there is always exactly one writer.
+ The pipeline follows this pattern:
+
+ ---------------------------------
+ | TuxOnIce Core |
+ ---------------------------------
+ |
+ |
+ ---------------------------------
+ | Page transformer 1 |
+ ---------------------------------
+ |
+ |
+ ---------------------------------
+ | Page transformer 2 |
+ ---------------------------------
+ |
+ |
+ ---------------------------------
+ | Writer |
+ ---------------------------------
+
+ During the writing of an image, the core code feeds pages one at a time
+ to the first module. This module performs whatever transformations it
+ implements on the incoming data, completely consuming the incoming data and
+ feeding output in a similar manner to the next module.
+
+ All routines are SMP safe, and the final result of the transformations is
+ written with an index (provided by the core) and size of the output by the
+ writer. As a result, we can have multithreaded I/O without needing to
+ worry about the sequence in which pages are written (or read).
+
+ During reading, the pipeline works in the reverse direction. The core code
+ calls the first module with the address of a buffer which should be filled.
+ (Note that the buffer size is always PAGE_SIZE at this time). This module
+ will in turn request data from the next module and so on down until the
+ writer is made to read from the stored image.
+
+ Part of the definition of the structure of a module thus looks like this:
+
+ int (*rw_init) (int rw, int stream_number);
+ int (*rw_cleanup) (int rw);
+ int (*write_chunk) (struct page *buffer_page);
+ int (*read_chunk) (struct page *buffer_page, int sync);
+
+ It should be noted that the _cleanup routine may be called before the
+ full stream of data has been read or written. While writing the image,
+ the user may (depending upon settings) choose to abort suspending, and
+ if we are in the midst of writing the last portion of the image, a portion
+ of the second pageset may be reread. This may also happen if an error
+ occurs and we seek to abort the process of writing the image.
+
+ The modular design is also useful in a number of other ways. It provides
+ a means whereby we can add support for:
+
+ - providing overall initialisation and cleanup routines;
+ - serialising configuration information in the image header;
+ - providing debugging information to the user;
+ - determining memory and image storage requirements;
+ - dis/enabling components at run-time;
+ - configuring the module (see below);
+
+ ...and routines for writers specific to their work:
+ - Parsing a resume= location;
+ - Determining whether an image exists;
+ - Marking a resume as having been attempted;
+ - Invalidating an image;
+
+ Since some parts of the core - the user interface and storage manager
+ support - have use for some of these functions, they are registered as
+ 'miscellaneous' modules as well.
+
+ d) Sysfs data structures.
+
+ This brings us naturally to support for configuring TuxOnIce. We desired to
+ provide a way to make TuxOnIce as flexible and configurable as possible.
+ The user shouldn't have to reboot just because they want to now hibernate to
+ a file instead of a partition, for example.
+
+ To accomplish this, TuxOnIce implements a very generic means whereby the
+ core and modules can register new sysfs entries. All TuxOnIce entries use
+ a single _store and _show routine, both of which are found in
+ tuxonice_sysfs.c in the kernel/power directory. These routines handle the
+ most common operations - getting and setting the values of bits, integers,
+ longs, unsigned longs and strings in one place, and allow overrides for
+ customised get and set options as well as side-effect routines for all
+ reads and writes.
+
+ When combined with some simple macros, a new sysfs entry can then be defined
+ in just a couple of lines:
+
+ SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
+ 2048, 0, NULL),
+
+ This defines a sysfs entry named "progress_granularity" which is rw and
+ allows the user to access an integer stored at &progress_granularity, giving
+ it a value between 1 and 2048 inclusive.
+
+ Sysfs entries are registered under /sys/power/tuxonice, and entries for
+ modules are located in a subdirectory named after the module.
+
diff --git a/Documentation/power/tuxonice.txt b/Documentation/power/tuxonice.txt
new file mode 100644
index 0000000..bfc0df5
--- /dev/null
+++ b/Documentation/power/tuxonice.txt
@@ -0,0 +1,924 @@
+ --- TuxOnIce, version 3.0 ---
+
+1. What is it?
+2. Why would you want it?
+3. What do you need to use it?
+4. Why not just use the version already in the kernel?
+5. How do you use it?
+6. What do all those entries in /sys/power/tuxonice do?
+7. How do you get support?
+8. I think I've found a bug. What should I do?
+9. When will XXX be supported?
+10. How does it work?
+11. Who wrote TuxOnIce?
+
+1. What is it?
+
+ Imagine you're sitting at your computer, working away. For some reason, you
+ need to turn off your computer for a while - perhaps it's time to go home
+ for the day. When you come back to your computer next, you're going to want
+ to carry on where you left off. Now imagine that you could push a button and
+ have your computer store the contents of its memory to disk and power down.
+ Then, when you next start up your computer, it loads that image back into
+ memory and you can carry on from where you were, just as if you'd never
+ turned the computer off. You have far less time to start up, no reopening of
+ applications or finding what directory you put that file in yesterday.
+ That's what TuxOnIce does.
+
+ TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who,
+ with some help from Pavel Machek, got an early version going in 1999. The
+ project was then taken over by Florent Chabaud while still in alpha version
+ numbers. Nigel Cunningham came on the scene when Florent was unable to
+ continue, moving the project into betas, then 1.0, 2.0 and so on up to
+ the present series. During the 2.0 series, the name was contracted to
+ Suspend2 and the website suspend2.net created. Beginning around July 2007,
+ a transition to calling the software TuxOnIce was made, to seek to help
+ make it clear that TuxOnIce is more concerned with hibernation than suspend
+ to ram.
+
+ Pavel Machek's swsusp code, which was merged around 2.5.17, retains the
+ original name, and was essentially a fork of the beta code until Rafael
+ Wysocki came on the scene in 2005 and began to improve it further.
+
+2. Why would you want it?
+
+ Why wouldn't you want it?
+
+ Being able to save the state of your system and quickly restore it improves
+ your productivity - you get a useful system in far less time than through
+ the normal boot process. You also get to be completely 'green', using zero
+ power, or as close to that as possible (the computer may still provide
+ minimal power to some devices, so they can initiate a power on, but that
+ will be the same amount of power as would be used if you told the computer
+ to shutdown).
+
+3. What do you need to use it?
+
+ a. Kernel Support.
+
+ i) The TuxOnIce patch.
+
+ TuxOnIce is part of the Linux Kernel. This version is not part of Linus's
+ 2.6 tree at the moment, so you will need to download the kernel source and
+ apply the latest patch. Having done that, enable the appropriate options in
+ make [menu|x]config (under Power Management Options - look for "Enhanced
+ Hibernation"), compile and install your kernel. TuxOnIce works with SMP,
+ Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64.
+
+ TuxOnIce patches are available from http://tuxonice.net.
+
+ ii) Compression support.
+
+ Compression support is implemented via the cryptoapi. You will therefore want
+ to select any Cryptoapi transforms that you want to use on your image from
+ the Cryptoapi menu while configuring your kernel. We recommend the use of the
+ LZO compression method - it is very fast and still achieves good compression.
+
+ You can also tell TuxOnIce to write its image to an encrypted and/or
+ compressed filesystem/swap partition. In that case, you don't need to do
+ anything special for TuxOnIce when it comes to kernel configuration.
+
+ iii) Configuring other options.
+
+ While you're configuring your kernel, try to configure as much as possible
+ to build as modules. We recommend this because there are a number of drivers
+ that are still in the process of implementing proper power management
+ support. In those cases, the best way to work around their current lack is
+ to build them as modules and remove the modules while hibernating. You might
+ also bug the driver authors to get their support up to speed, or even help!
+
+ b. Storage.
+
+ i) Swap.
+
+ TuxOnIce can store the hibernation image in your swap partition, a swap file or
+ a combination thereof. Whichever combination you choose, you will probably
+ want to create enough swap space to store the largest image you could have,
+ plus the space you'd normally use for swap. A good rule of thumb would be
+ to calculate the amount of swap you'd want without using TuxOnIce, and then
+ add the amount of memory you have. This swapspace can be arranged in any way
+ you'd like. It can be in one partition or file, or spread over a number. The
+ only requirement is that they be active when you start a hibernation cycle.
+
+ There is one exception to this requirement. TuxOnIce has the ability to turn
+ on one swap file or partition at the start of hibernating and turn it back off
+ at the end. If you want to ensure you have enough memory to store an image
+ when your memory is fully used, you might want to make one swap partition or
+ file for 'normal' use, and another for TuxOnIce to activate & deactivate
+ automatically. (Further details below).
+
+ ii) Normal files.
+
+ TuxOnIce includes a 'file allocator'. The file allocator can store your
+ image in a simple file. Since Linux has the concept of everything being a
+ file, this is more powerful than it initially sounds. If, for example, you
+ were to set up a network block device file, you could hibernate to a network
+ server. This has been tested and works to a point, but nbd itself isn't
+ stateless enough for our purposes.
+
+ Take extra care when setting up the file allocator. If you just type
+ commands without thinking and then try to hibernate, you could cause
+ irreversible corruption on your filesystems! Make sure you have backups.
+
+ Most people will only want to hibernate to a local file. To achieve that, do
+ something along the lines of:
+
+ echo "TuxOnIce" > /hibernation-file
+ dd if=/dev/zero bs=1M count=512 >> /hibernation-file
+
+ This will create a 512MB file called /hibernation-file. To get TuxOnIce to use
+ it:
+
+ echo /hibernation-file > /sys/power/tuxonice/file/target
+
+ Then
+
+ cat /sys/power/tuxonice/resume
+
+ Put the results of this into your bootloader's configuration (see also step
+ C, below):
+
+ ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
+ # cat /sys/power/tuxonice/resume
+ file:/dev/hda2:0x1e001
+
+ In this example, we would edit the append= line of our lilo.conf|menu.lst
+ so that it included:
+
+ resume=file:/dev/hda2:0x1e001
+ ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE---
+
+ For those who are thinking 'Could I make the file sparse?', the answer is
+ 'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in
+ a sparse file while hibernating. In the longer term (post merge!), I'd like
+ to change things so that the file could be dynamically resized and have
+ holes filled as needed. Right now, however, that's not possible and not a
+ priority.
+
+ c. Bootloader configuration.
+
+ Using TuxOnIce also requires that you add an extra parameter to
+ your lilo.conf or equivalent. Here's an example for a swap partition:
+
+ append="resume=swap:/dev/hda1"
+
+ This would tell TuxOnIce that /dev/hda1 is a swap partition you
+ have. TuxOnIce will use the swap signature of this partition as a
+ pointer to your data when you hibernate. This means that (in this example)
+ /dev/hda1 doesn't need to be _the_ swap partition where all of your data
+ is actually stored. It just needs to be a swap partition that has a
+ valid signature.
+
+ You don't need to have a swap partition for this purpose. TuxOnIce
+ can also use a swap file, but usage is a little more complex. Having made
+ your swap file, turn it on and do
+
+ cat /sys/power/tuxonice/swap/headerlocations
+
+ (this assumes you've already compiled your kernel with TuxOnIce
+ support and booted it). The results of the cat command will tell you
+ what you need to put in lilo.conf:
+
+ For swap partitions like /dev/hda1, simply use resume=/dev/hda1.
+ For swapfile `swapfile`, use resume=swap:/dev/hda2:0x242d.
+
+ If the swapfile changes for any reason (it is moved to a different
+ location, it is deleted and recreated, or the filesystem is
+ defragmented) then you will have to check
+ /sys/power/tuxonice/swap/headerlocations for a new resume_block value.
+
+ Once you've compiled and installed the kernel and adjusted your bootloader
+ configuration, you should only need to reboot for the most basic part
+ of TuxOnIce to be ready.
+
+ If you only compile in the swap allocator, or only compile in the file
+ allocator, you don't need to add the "swap:" part of the resume=
+ parameters above. resume=/dev/hda2:0x242d will work just as well. If you
+ have compiled both and your storage is on swap, you can also use this
+ format (the swap allocator is the default allocator).
+
+ When compiling your kernel, one of the options in the 'Power Management
+ Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is
+ called 'Default resume partition'. This can be used to set a default value
+ for the resume= parameter.
+
+ d. The hibernate script.
+
+ Since the driver model in 2.6 kernels is still being developed, you may need
+ to do more than just configure TuxOnIce. Users of TuxOnIce usually start the
+ process via a script which prepares for the hibernation cycle, tells the
+ kernel to do its stuff and then restores things afterwards. This script might
+ involve:
+
+ - Switching to a text console and back if X doesn't like the video card
+ status on resume.
+ - Un/reloading drivers that don't play well with hibernation.
+
+ Note that you might not be able to unload some drivers if there are
+ processes using them. You might have to kill off processes that hold
+ devices open. Hint: if your X server accesses a USB mouse, doing a
+ 'chvt' to a text console releases the device and you can unload the
+ module.
+
+ Check out the latest script (available on tuxonice.net).
+
+ e. The userspace user interface.
+
+ TuxOnIce has very limited support for displaying status if you only apply
+ the kernel patch - it can printk messages, but that is all. In addition,
+ some of the functions mentioned in this document (such as cancelling a cycle
+ or performing interactive debugging) are unavailable. To utilise these
+ functions, or simply get a nice display, you need the 'userui' component.
+ Userui comes in three flavours, usplash, fbsplash and text. Text should
+ work on any console. Usplash and fbsplash require the appropriate
+ (distro specific?) support.
+
+ To utilise a userui, TuxOnIce just needs to be told where to find the
+ userspace binary:
+
+ echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program
+
+ The hibernate script can do this for you, and a default value for this
+ setting can be configured when compiling the kernel. This path is also
+ stored in the image header, so if you have an initrd or initramfs, you can
+ use the userui during the first part of resuming (prior to the atomic
+ restore) by putting the binary in the same path in your initrd/ramfs.
+ Alternatively, you can put it in a different location and do an echo
+ similar to the above prior to the echo > do_resume. The value saved in the
+ image header will then be ignored.
+
+4. Why not just use the version already in the kernel?
+
+ The version in the vanilla kernel has a number of drawbacks. The most
+ serious of these are:
+ - it has a maximum image size of 1/2 total memory;
+ - it doesn't allocate storage until after it has snapshotted memory.
+ This means that you can't be sure hibernating will work until you
+ see it start to write the image;
+ - it does not allow you to press escape to cancel a cycle;
+ - it does not allow you to press escape to cancel resuming;
+ - it does not allow you to automatically swapon a file when
+ starting a cycle;
+ - it does not allow you to use multiple swap partitions or files;
+ - it does not allow you to use ordinary files;
+ - it just invalidates an image and continues to boot if you
+ accidentally boot the wrong kernel after hibernating;
+ - it doesn't support any sort of nice display while hibernating;
+ - it is moving toward requiring that you have an initrd/initramfs
+ to ever have a hope of resuming (uswsusp). While uswsusp will
+ address some of the concerns above, it won't address all of them,
+ and will be more complicated to get set up;
+ - it doesn't have support for suspend-to-both (write a hibernation
+ image, then suspend to ram; I think this is known as ReadySafe
+ under M$).
+
+5. How do you use it?
+
+ A hibernation cycle can be started directly by doing:
+
+ echo > /sys/power/tuxonice/do_hibernate
+
+ In practice, though, you'll probably want to use the hibernate script
+ to unload modules, configure the kernel the way you like it and so on.
+ In that case, you'd do (as root):
+
+ hibernate
+
+ See the hibernate script's man page for more details on the options it
+ takes.
+
+ If you're using the text or splash user interface modules, one feature of
+ TuxOnIce that you might find useful is that you can press Escape at any time
+ during hibernating, and the process will be aborted.
+
+ Due to the way hibernation works, this means you'll have your system back and
+ perfectly usable almost instantly. The only exception is when it's at the
+ very end of writing the image. Then it will need to reload a small (usually
+ 4-50MBs, depending upon the image characteristics) portion first.
+
+ Likewise, when resuming, you can press escape and resuming will be aborted.
+ The computer will then powerdown again according to settings at that time for
+ the powerdown method or rebooting.
+
+ You can change the settings for powering down while the image is being
+ written by pressing 'R' to toggle rebooting and 'O' to toggle between
+ suspending to ram and powering down completely.
+
+ If you run into problems with resuming, adding the "noresume" option to
+ the kernel command line will let you skip the resume step and recover your
+ system. This option shouldn't normally be needed, because TuxOnIce modifies
+ the image header prior to the atomic restore, and will thus prompt you
+ if it detects that you've tried to resume an image before (this flag is
+ removed if you press Escape to cancel a resume, so you won't be prompted
+ then).
+
+ Recent kernels (2.6.24 onwards) add support for resuming from a different
+ kernel to the one that was hibernated (thanks to Rafael for his work on
+ this - I've just embraced and enhanced the support for TuxOnIce). This
+ should further reduce the need for you to use the noresume option.
+
+6. What do all those entries in /sys/power/tuxonice do?
+
+ /sys/power/tuxonice is the directory which contains files you can use to
+ tune and configure TuxOnIce to your liking. The exact contents of
+ the directory will depend upon the version of TuxOnIce you're
+ running and the options you selected at compile time. In the following
+ descriptions, names in brackets refer to compile time options.
+ (Note that they're all dependent upon you having selected CONFIG_TUXONICE
+ in the first place!).
+
+ Since the values of these settings can open potential security risks, the
+ writeable ones are accessible only to the root user. You may want to
+ configure sudo to allow you to invoke your hibernate script as an ordinary
+ user.
+
+ - alt_resume_param
+
+ Instead of powering down after writing a hibernation image, TuxOnIce
+ supports resuming from a different image. This entry lets you set the
+ location of the signature for that image (the resume= value you'd use
+ for it). Using an alternate image and keep_image mode, you can do things
+ like using an alternate image to power down an uninterruptible power
+ supply.
+
+ - block_io/target_outstanding_io
+
+ This value controls the amount of memory that the block I/O code says it
+ needs when the core code is calculating how much memory is needed for
+ hibernating and for resuming. It doesn't directly control the amount of
+ I/O that is submitted at any one time - that depends on the amount of
+ available memory (we may have more available than we asked for), the
+ throughput that is being achieved and the ability of the CPU to keep up
+ with disk throughput (particularly where we're compressing pages).
+
+ - compression/algorithm
+
+ Set the cryptoapi algorithm used for compressing the image.
+
+ - compression/expected_compression
+
+ This value allows you to set an expected compression ratio, which TuxOnIce
+ will use in calculating whether it meets constraints on the image size. If
+ this expected compression ratio is not attained, the hibernation cycle will
+ abort, so it is wise to allow some spare. You can see what compression
+ ratio is achieved in the logs after hibernating.
+
+ - debug_info:
+
+ This file returns information about your configuration that may be helpful
+ in diagnosing problems with hibernating.
+
+ - did_suspend_to_both:
+
+ This file can be used when you hibernate with powerdown method 3 (ie suspend
+ to ram after writing the image). There can be two outcomes in this case. We
+ can resume from the suspend-to-ram before the battery runs out, or we can run
+ out of juice and end up resuming like normal. This entry lets you find out,
+ post resume, which way we went. If the value is 1, we resumed from suspend
+ to ram. This can be useful when actions need to be run post suspend-to-ram
+ that don't need to be run if we did the normal resume from power off.
+
+ - do_hibernate:
+
+ When anything is written to this file, the kernel side of TuxOnIce will
+ begin to attempt to write an image to disk and power down. You'll normally
+ want to run the hibernate script instead, to get modules unloaded first.
+
+ - do_resume:
+
+ When anything is written to this file TuxOnIce will attempt to read and
+ restore an image. If there is no image, it will return almost immediately.
+ If an image exists, the echo > will never return. Instead, the original
+ kernel context will be restored and the original echo > do_hibernate will
+ return.
+
+ - */enabled
+
+ These options can be used to temporarily disable various parts of TuxOnIce.
+
+ - extra_pages_allowance
+
+ When TuxOnIce does its atomic copy, it calls the driver model suspend
+ and resume methods. If you have DRI enabled with a driver such as fglrx,
+ this can result in the driver allocating a substantial amount of memory
+ for storing its state. Extra_pages_allowance tells TuxOnIce how much
+ extra memory it should ensure is available for those allocations. If
+ your attempts at hibernating end with a message in dmesg indicating that
+ insufficient extra pages were allowed, you need to increase this value.
+
+ - file/target:
+
+ Read this value to get the current setting. Write to it to point TuxOnIce
+ at a new storage location for the file allocator. See section 3.b.ii above
+ for details of how to set up the file allocator.
+
+ - freezer_test
+
+ This entry can be used to get TuxOnIce to just test the freezer and prepare
+ an image without actually doing a hibernation cycle. It is useful for
+ diagnosing freezing and image preparation issues.
+
+ - full_pageset2
+
+ TuxOnIce divides the pages that are stored in an image into two sets. The
+ difference between the two sets is that pages in pageset 1 are atomically
+ copied, and pages in pageset 2 are written to disk without being copied
+ first. A page CAN be written to disk without being copied first if and only
+ if its contents will not be modified or used at any time after userspace
+ processes are frozen. A page MUST be in pageset 1 if its contents are
+ modified or used at any time after userspace processes have been frozen.
+
+ Normally (ie if this option is enabled), TuxOnIce will put all pages on the
+ per-zone LRUs in pageset2, then remove those pages used by any userspace
+ user interface helper and TuxOnIce storage manager that are running,
+ together with pages used by the GEM memory manager introduced around 2.6.28
+ kernels.
+
+ If this option is disabled, a much more conservative approach will be taken.
+ The only pages in pageset2 will be those belonging to userspace processes,
+ with the exclusion of those belonging to the TuxOnIce userspace helpers
+ mentioned above. This will result in a much smaller pageset2, and will
+ therefore result in smaller images than are possible with this option
+ enabled.
+
+ - ignore_rootfs
+
+ TuxOnIce records which device is mounted as the root filesystem when
+ writing the hibernation image. It will normally check at resume time that
+ this device isn't already mounted - that would be a cause of filesystem
+ corruption. In some particular cases (RAM based root filesystems), you
+ might want to disable this check. This option allows you to do that.
+
+ - image_exists:
+
+ Can be used in a script to determine whether a valid image exists at the
+ location currently pointed to by resume=. Returns up to three lines.
+ The first is whether an image exists (-1 for unsure, otherwise 0 or 1).
+ If an image exists, additional lines will return the machine and version.
+ Echoing anything to this entry removes any current image.
+
+ - image_size_limit:
+
+ The maximum size of hibernation image written to disk, measured in megabytes
+ (1024*1024).
+
+ - last_result:
+
+ The result of the last hibernation cycle, as defined in
+ include/linux/suspend-debug.h with the values SUSPEND_ABORTED to
+ SUSPEND_KEPT_IMAGE. This is a bitmask.
+
+ - late_cpu_hotplug:
+
+ This sysfs entry controls whether cpu hotplugging is done - as normal - just
+ before (unplug) and after (replug) the atomic copy/restore (so that all
+ CPUs/cores are available for multithreaded I/O). The alternative is to
+ unplug all secondary CPUs/cores at the start of hibernating/resuming, and
+ replug them at the end of resuming. No multithreaded I/O will be possible in
+ this configuration, but the odd machine has been reported to require it.
+
+ - lid_file:
+
+ This determines which ACPI button file we look in to determine whether the
+ lid is open or closed after resuming from suspend to disk or power off.
+ If the entry is set to "lid/LID", we'll open /proc/acpi/button/lid/LID/state
+ and check its contents at the appropriate moment. See post_wake_state below
+ for more details on how this entry is used.
+
+ - log_everything (CONFIG_PM_DEBUG):
+
+ Setting this option results in all messages printed being logged. Normally,
+ only a subset are logged, so as to not slow the process and not clutter the
+ logs. Useful for debugging. It can be toggled during a cycle by pressing
+ 'L'.
+
+ - no_load_direct:
+
+ This is a debugging option. If, when loading the atomically copied pages of
+ an image, TuxOnIce finds that the destination address for a page is free,
+ it will normally allocate the page, load the data directly into that
+ address and skip it in the atomic restore. If this option is enabled, the
+ page will be loaded somewhere else and atomically restored like other pages.
+
+ - no_flusher_thread:
+
+ When doing multithreaded I/O (see below), the first online CPU can be used
+ to _just_ submit compressed pages when writing the image, rather than
+ compressing and submitting data. This option is normally disabled, but has
+ been included because Nigel would like to see whether it will be more useful
+ as the number of cores/cpus in computers increases.
+
+ - no_multithreaded_io:
+
+ TuxOnIce will normally create one thread per cpu/core on your computer,
+ each of which will then perform I/O. This will generally result in
+ throughput that's the maximum the storage medium can handle. There
+ shouldn't be any reason to disable multithreaded I/O now, but this option
+ has been retained for debugging purposes.
+
+ - no_pageset2
+
+ See the entry for full_pageset2 above for an explanation of pagesets.
+ Enabling this option causes TuxOnIce to do an atomic copy of all pages,
+ thereby limiting the maximum image size to 1/2 of memory, as swsusp does.
+
+ - no_pageset2_if_unneeded
+
+ See the entry for full_pageset2 above for an explanation of pagesets.
+ Enabling this option causes TuxOnIce to act like no_pageset2 was enabled
+ if and only if it isn't needed anyway. This option may still make TuxOnIce
+ less reliable because pageset2 pages are normally used to store the
+ atomic copy - drivers that want to do allocations of larger amounts of
+ memory in one shot will be more likely to find that those amounts aren't
+ available if this option is enabled.
+
+ - pause_between_steps (CONFIG_PM_DEBUG):
+
+ This option is used during debugging, to make TuxOnIce pause between
+ each step of the process. It is ignored when the nice display is on.
+
+ - post_wake_state:
+
+ TuxOnIce provides support for automatically waking after a user-selected
+ delay, and using a different powerdown method if the lid is still closed.
+ (Yes, we're assuming a laptop). This entry lets you choose what state
+ should be entered next. The values are those described under
+ powerdown_method, below. It can be used to suspend to RAM after hibernating,
+ then powerdown properly (say) 20 minutes later. It can also be used to power
+ down properly, then wake at (say) 6.30am and suspend to RAM until you're
+ ready to use the machine.
+
+ - powerdown_method:
+
+ Used to select a method by which TuxOnIce should powerdown after writing the
+ image. Currently:
+
+ 0: Don't use ACPI to power off.
+ 3: Attempt to enter Suspend-to-ram.
+ 4: Attempt to enter ACPI S4 mode.
+ 5: Attempt to power down via ACPI S5 mode.
+
+ Note that these options are highly dependent upon your hardware & software:
+
+ 3: When successful, your machine suspends to ram instead of powering off.
+ The advantage of using this mode is that it doesn't matter whether your
+ battery has enough charge to make it through to your next resume. If it
+ lasts, you will simply resume from suspend to ram (and the image on disk
+ will be discarded). If the battery runs out, you will resume from disk
+ instead. The disadvantage is that it takes longer than a normal
+ suspend-to-ram to enter the state, since the suspend-to-disk image needs
+ to be written first.
+ 4/5: When successful, your machine will be off and consume (almost) no power.
+ But it might still react to some external events like opening the lid or
+ traffic on a network or usb device. For the bios, resume is then the same
+ as warm boot, similar to a situation where you used the command `reboot'
+ to reboot your machine. If your machine has problems on warm boot or if
+ you want to protect your machine with the bios password, this is probably
+ not the right choice. Mode 4 may be necessary on some machines where ACPI
+ wake up methods need to be run to properly reinitialise hardware after a
+ hibernation cycle.
+ 0: Switch the machine completely off. The only possible wakeup is the power
+ button. For the bios, resume is then the same as a cold boot, in
+ particular you would have to provide your bios boot password if your
+ machine uses that feature for booting.
+
+ - progressbar_granularity_limit:
+
+ This option can be used to limit the granularity of the progress bar
+ displayed with a bootsplash screen. The value is the maximum number of
+ steps. That is, 10 will make the progress bar jump in 10% increments.
+
+ - reboot:
+
+ This option causes TuxOnIce to reboot rather than powering down
+ at the end of saving an image. It can be toggled during a cycle by pressing
+ 'R'.
+
+ - resume:
+
+ This sysfs entry can be used to read and set the location in which TuxOnIce
+ will look for the signature of an image - the value set using resume= at
+ boot time or CONFIG_PM_STD_PARTITION ("Default resume partition"). By
+ writing to this file as well as modifying your bootloader's configuration
+ file (eg menu.lst), you can set or reset the location of your image or the
+ method of storing the image without rebooting.
+
+ - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP):
+
+ This option makes
+
+ echo disk > /sys/power/state
+
+ activate TuxOnIce instead of swsusp. Regardless of whether this option is
+ enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce
+ to check for an image too. This is due to the fact that at resume time, we
+ can't know whether this option was enabled until we see if an image is there
+ for us to resume from. (And when an image exists, we don't care whether we
+ did replace swsusp anyway - we just want to resume).
+
+ - resume_commandline:
+
+ This entry can be read after resuming to see the commandline that was used
+ when resuming began. You might use this to set up two bootloader entries
+ that are the same apart from the fact that one includes an extra append=
+ argument "at_work=1". You could then grep resume_commandline in your
+ post-resume scripts and configure networking (for example) differently
+ depending upon whether you're at home or work. resume_commandline can be
+ set to arbitrary text if you wish to remove sensitive contents.
+
+ - swap/swapfilename:
+
+ This entry is used to specify the swapfile or partition that
+ TuxOnIce will attempt to swapon/swapoff automatically. Thus, if
+ I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
+ for my hibernation image, I would
+
+ echo /dev/hda2 > /sys/power/tuxonice/swap/swapfilename
+
+ /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
+ swapon and swapoff occur while other processes are frozen (including kswapd)
+ so this swap file will not be used up when attempting to free memory. The
+ partition/file is also given the highest priority, so other swapfiles/partitions
+ will only be used to save the image when this one is filled.
+
+ The value of this file is used by headerlocations along with any currently
+ activated swapfiles/partitions.
+
+ - swap/headerlocations:
+
+ This option tells you the resume= options to use for swap devices you
+ currently have activated. It is particularly useful when you only want to
+ use a swap file to store your image. See above for further details.
+
+ - test_bio
+
+ This is a debugging option. When enabled, TuxOnIce will not hibernate.
+ Instead, when asked to write an image, it will skip the atomic copy,
+ just doing the writing of the image and then returning control to the
+ user at the point where it would have powered off. This is useful for
+ testing throughput in different configurations.
+
+ - test_filter_speed
+
+ This is a debugging option. When enabled, TuxOnIce will not hibernate.
+ Instead, when asked to write an image, it will not write anything or do
+ an atomic copy, but will only run any enabled compression algorithm on the
+ data that would have been written (the source pages of the atomic copy in
+ the case of pageset 1). This is useful for comparing the performance of
+ compression algorithms and for determining the extent to which an upgrade
+ to your storage method would improve hibernation speed.
+
+ - user_interface/debug_sections (CONFIG_PM_DEBUG):
+
+ This value, together with the console log level, controls what debugging
+ information is displayed. The console log level determines the level of
+ detail, and this value determines what detail is displayed. This value is
+ a bit vector, and the meaning of the bits can be found in the kernel tree
+ in include/linux/tuxonice.h. It can be overridden using the kernel's
+ command line option suspend_dbg.
+
+ - user_interface/default_console_level (CONFIG_PM_DEBUG):
+
+ This determines the value of the console log level at the start of a
+ hibernation cycle. If debugging is compiled in, the console log level can be
+ changed during a cycle by pressing the digit keys. Meanings are:
+
+ 0: Nice display.
+ 1: Nice display plus numerical progress.
+ 2: Errors only.
+ 3: Low level debugging info.
+ 4: Medium level debugging info.
+ 5: High level debugging info.
+ 6: Verbose debugging info.
+
+ - user_interface/enable_escape:
+
+ Setting this to "1" will enable you to abort a hibernation cycle or resume by
+ pressing escape; "0" (default) disables this feature. Note that enabling
+ this option means that you cannot initiate a hibernation cycle and then walk
+ away from your computer, expecting it to be secure. With this feature
+ disabled, you can validly have this expectation once TuxOnIce begins to write
+ the image to disk. (Prior to this point, it is possible that TuxOnIce might
+ abort because of failure to freeze all processes or because constraints
+ on its ability to save the image are not met).
+
+ - user_interface/program
+
+ This entry is used to tell TuxOnIce what userspace program to use for
+ providing a user interface while hibernating. The program uses a netlink
+ socket to pass messages back and forth to the kernel, allowing it to provide
+ all of the functions formerly implemented in the kernel user interface
+ components.
+
+ - version:
+
+ The version of TuxOnIce you have compiled into the currently running kernel.
+
+ - wake_alarm_dir:
+
+ As mentioned above (post_wake_state), TuxOnIce supports automatically waking
+ after some delay. This entry allows you to select which wake alarm to use.
+ It should contain the value "rtc0" if you're wanting to use
+ /sys/class/rtc/rtc0.
+
+ - wake_delay:
+
+ This value determines the delay from the end of writing the image until the
+ wake alarm is triggered. You can set an absolute time by writing the desired
+ time into /sys/class/rtc/<wake_alarm_dir>/wakealarm and leaving these values
+ empty.
+
+ Note that for the wakeup to actually occur, you may need to modify entries
+ in /proc/acpi/wakeup. This is done by echoing the name of the button in the
+ first column (eg PBTN) into the file.
+
+7. How do you get support?
+
+ Glad you asked. TuxOnIce is being actively maintained and supported
+ by Nigel (the guy doing most of the kernel coding at the moment), Bernard
+ (who maintains the hibernate script and userspace user interface components)
+ and its users.
+
+ Resources available include HowTos, FAQs and a Wiki, all available via
+ tuxonice.net. You can find the mailing lists there.
+
+8. I think I've found a bug. What should I do?
+
+ By far and away, the most common problems people have with TuxOnIce are
+ related to drivers not having adequate power management support. In this
+ case, it is not a bug with TuxOnIce, but we can still help you. As we
+ mentioned above, such issues can usually be worked around by building the
+ functionality as modules and unloading them while hibernating. Please visit
+ the Wiki for up-to-date lists of known issues and work arounds.
+
+ If this information doesn't help, try running:
+
+ hibernate --bug-report
+
+ ..and sending the output to the users mailing list.
+
+ Good information on how to provide us with useful information from an
+ oops is found in the file REPORTING-BUGS, in the top level directory
+ of the kernel tree. If you get an oops, please especially note the
+ information about running what is printed on the screen through ksymoops.
+ The raw information is useless.
+
+9. When will XXX be supported?
+
+ If there's a feature missing from TuxOnIce that you'd like, feel free to
+ ask. We try to be obliging, within reason.
+
+ Patches are welcome. Please send to the list.
+
+10. How does it work?
+
+ TuxOnIce does its work in a number of steps.
+
+ a. Freezing system activity.
+
+ The first main stage in hibernating is to stop all other activity. This is
+ achieved in stages. Processes are considered in four groups, which we will
+ describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
+ flag, kernel threads without this flag, userspace processes with the
+ PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
+ untouched by the refrigerator code. They are allowed to run during hibernating
+ and resuming, and are used to support user interaction, storage access or the
+ like. Other kernel threads (those unneeded while hibernating) are frozen last.
+ This leaves us with userspace processes that need to be frozen. When a
+ process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
+ that process for the duration of that call. Processes that have this flag are
+ frozen after processes without it, so that we can seek to ensure that dirty
+ data is synced to disk as quickly as possible in a situation where other
+ processes may be submitting writes at the same time. Freezing the processes
+ that are submitting data stops new I/O from being submitted. Syncthreads can
+ then cleanly finish their work. So the order is:
+
+ - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
+ - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
+ - Kernel processes without PF_NOFREEZE.
+
+ b. Eating memory.
+
+ For a successful hibernation cycle, you need to have enough disk space to store the
+ image and enough memory for the various limitations of TuxOnIce's
+ algorithm. You can also specify a maximum image size. In order to meet
+ those constraints, TuxOnIce may 'eat' memory. If, after freezing
+ processes, the constraints aren't met, TuxOnIce will thaw all the
+ other processes and begin to eat memory until its calculations indicate
+ the constraints are met. It will then freeze processes again and recheck
+ its calculations.
+
+ c. Allocation of storage.
+
+ Next, TuxOnIce allocates the storage that will be used to save
+ the image.
+
+ The core of TuxOnIce knows nothing about how or where pages are stored. We
+ therefore request the active allocator (remember you might have compiled in
+ more than one!) to allocate enough storage for our expected image size. If
+ this request cannot be fulfilled, we eat more memory and try again. If it
+ is fulfilled, we seek to allocate additional storage, just in case our
+ expected compression ratio (if any) isn't achieved. This time, however, we
+ just continue if we can't allocate enough storage.
+
+ If these calls to our allocator change the characteristics of the image
+ such that we haven't allocated enough memory, we also loop. (The allocator
+ may well need to allocate space for its storage information).
+
+ d. Write the first part of the image.
+
+ TuxOnIce stores the image in two sets of pages called 'pagesets'.
+ Pageset 2 contains pages on the active and inactive lists; essentially
+ the page cache. Pageset 1 contains all other pages, including the kernel.
+ We use two pagesets for one important reason: We need to make an atomic copy
+ of the kernel to ensure consistency of the image. Without a second pageset,
+ that would limit us to an image that was at most half the amount of memory
+ available. Using two pagesets allows us to store a full image. Since pageset
+ 2 pages won't be needed in saving pageset 1, we first save pageset 2 pages.
+ We can then make our atomic copy of the remaining pages using both pageset 2
+ pages and any other pages that are free. While saving both pagesets, we are
+ careful not to corrupt the image. Among other things, we use lowlevel block
+ I/O routines that don't change the pagecache contents.
+
+ The next step, then, is writing pageset 2.
+
+ e. Suspending drivers and storing processor context.
+
+ Having written pageset2, TuxOnIce calls the power management functions to
+ notify drivers of the hibernation, and saves the processor state in preparation
+ for the atomic copy of memory we are about to make.
+
+ f. Atomic copy.
+
+ At this stage, everything else but the TuxOnIce code is halted. Processes
+ are frozen or idling, drivers are quiesced and have stored (ideally and where
+ necessary) their configuration in memory we are about to atomically copy.
+ In our lowlevel architecture specific code, we have saved the CPU state.
+ We can therefore now do our atomic copy before resuming drivers etc.
+
+ g. Save the atomic copy (pageset 1).
+
+ TuxOnIce can then write the atomic copy of the remaining pages. Since we
+ have copied the pages into other locations, we can continue to use the
+ normal block I/O routines without fear of corrupting our image.
+
+ h. Save the image header.
+
+ Nearly there! We save our settings and other parameters needed for
+ reloading pageset 1 in an 'image header'. We also tell our allocator to
+ serialise its data at this stage, so that it can reread the image at resume
+ time.
+
+ i. Set the image header.
+
+ Finally, we edit the header at our resume= location. The signature is
+ changed by the allocator to reflect the fact that an image exists, and to
+ point to the start of that data if necessary (swap allocator).
+
+ j. Power down.
+
+ Or reboot if we're debugging and the appropriate option is selected.
+
+ Whew!
+
+ Reloading the image.
+ --------------------
+
+ Reloading the image is essentially the reverse of all the above. We load
+ our copy of pageset 1, being careful to choose locations that aren't going
+ to be overwritten as we copy it back (We start very early in the boot
+ process, so there are no other processes to quiesce here). We then copy
+ pageset 1 back to its original location in memory and restore the process
+ context. We are now running with the original kernel. Next, we reload the
+ pageset 2 pages, free the memory and swap used by TuxOnIce, restore
+ the pageset header and restart processes. Sounds easy in comparison to
+ hibernating, doesn't it!
+
+ There is of course more to TuxOnIce than this, but this explanation
+ should be a good start. If there's interest, I'll write further
+ documentation on range pages and the low level I/O.
+
+11. Who wrote TuxOnIce?
+
+ (Answer based on the writings of Florent Chabaud, credits in files and
+ Nigel's limited knowledge; apologies to anyone missed out!)
+
+ The main developers of TuxOnIce have been...
+
+ Gabor Kuti
+ Pavel Machek
+ Florent Chabaud
+ Bernard Blackham
+ Nigel Cunningham
+
+ Significant portions of swsusp, the code in the vanilla kernel which
+ TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should
+ also be expressed to him.
+
+ The above mentioned developers have been aided in their efforts by a host
+ of hundreds, if not thousands of testers and people who have submitted bug
+ fixes & suggestions. Of special note are the efforts of Michael Frank, who
+ had his computers repetitively hibernate and resume for literally tens of
+ thousands of cycles and developed scripts to stress the system and test
+ TuxOnIce far beyond the point most of us (Nigel included!) would consider
+ testing. His efforts have contributed as much to TuxOnIce as any of the
+ names above.
diff --git a/MAINTAINERS b/MAINTAINERS
index 2b349ba..d43cd3a 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5630,6 +5630,13 @@ S: Maintained
F: drivers/tc/
F: include/linux/tc.h
+TUXONICE (ENHANCED HIBERNATION)
+P: Nigel Cunningham
+M: ni...@tuxonice.net
+L: tuxonic...@tuxonice.net
+W: http://tuxonice.net
+S: Maintained
+
U14-34F SCSI DRIVER
P: Dario Ballabio
M: ballabi...@emc.com
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
include/linux/netlink.h | 1 +
kernel/power/Kconfig | 17 +
kernel/power/Makefile | 2 +
kernel/power/tuxonice_userui.c | 662 ++++++++++++++++++++++++++++++++++++++++
4 files changed, 682 insertions(+), 0 deletions(-)
create mode 100644 kernel/power/tuxonice_userui.c
diff --git a/include/linux/netlink.h b/include/linux/netlink.h
index d597e15..f220828 100644
--- a/include/linux/netlink.h
+++ b/include/linux/netlink.h
@@ -24,6 +24,7 @@
/* leave room for NETLINK_DM (DM Events) */
#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */
#define NETLINK_ECRYPTFS 19
+#define NETLINK_TOI_USERUI 20 /* TuxOnIce's userui */
#define NETLINK_TOI_USM 21 /* Userspace storage manager */
#define MAX_LINKS 32
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index d8e62b6..f669cdd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -227,6 +227,23 @@ menuconfig TOI_CORE
comment "No compression support available without Cryptoapi support."
depends on TOI_CORE && !CRYPTO
+ config TOI_USERUI
+ bool "Userspace User Interface support"
+ depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE)
+ default y
+ ---help---
+ This option enables support for a userspace based user interface
+ to TuxOnIce, which allows you to have a nice display while suspending
+ and resuming, and also enables features such as pressing escape to
+ cancel a cycle or interactive debugging.
+
+ config TOI_USERUI_DEFAULT_PATH
+ string "Default userui program location"
+ default "/usr/local/sbin/tuxoniceui_text"
+ depends on TOI_USERUI
+ ---help---
+ This entry allows you to specify a default path to the userui binary.
+
config TOI_KEEP_IMAGE
bool "Allow Keep Image Mode"
depends on TOI_CORE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 9f6d06f..d67a242 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -20,6 +20,8 @@ obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
obj-$(CONFIG_TOI_SWAP) += tuxonice_block_io.o tuxonice_swap.o
obj-$(CONFIG_TOI_FILE) += tuxonice_block_io.o tuxonice_file.o
+obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o
+
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c
new file mode 100644
index 0000000..c7b1053
--- /dev/null
+++ b/kernel/power/tuxonice_userui.c
@@ -0,0 +1,662 @@
+/*
+ * kernel/power/tuxonice_userui.c
+ *
+ * Copyright (C) 2005-2007 Bernard Blackham
+ * Copyright (C) 2002-2008 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * This file is released under the GPLv2.
+ *
+ * Routines for TuxOnIce's user interface.
+ *
+ * The user interface code talks to a userspace program via a
+ * netlink socket.
+ *
+ * The kernel side:
+ * - starts the userui program;
+ * - sends text messages and progress bar status;
+ *
+ * The user space side:
+ * - passes messages regarding user requests (abort, toggle reboot etc)
+ *
+ */
+
+#define __KERNEL_SYSCALLS__
+
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/console.h>
+#include <linux/ctype.h>
+#include <linux/tty.h>
+#include <linux/vt_kern.h>
+#include <linux/reboot.h>
+#include <linux/kmod.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
+
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_netlink.h"
+#include "tuxonice_power_off.h"
+
+/* Scratch buffer in which userui_abort_hibernate() formats its message. */
+static char local_printf_buf[1024]; /* Same as printk - should be safe */
+
+/* State describing the userspace helper we talk to over netlink. */
+static struct user_helper_data ui_helper_data;
+/* Forward declaration; the initialiser follows the sysfs table below. */
+static struct toi_module_ops userui_ops;
+/* Value of kmsg_redirect saved when the console is prepared. */
+static int orig_kmsg;
+
+/* Most recent header text formatted by userui_prepare_status(). */
+static char lastheader[512];
+/* Value vsnprintf() returned when lastheader was last formatted. */
+static int lastheader_message_len;
+static int ui_helper_changed; /* Used at resume-time so don't overwrite value
+ set from initrd/ramfs. */
+
+/* Number of distinct progress amounts that userspace can display */
+static int progress_granularity = 30;
+
+/* Sleepers here are woken on a key press, an abort or when pausing ends. */
+static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key);
+
+/**
+ * ui_nl_set_state - Update toi_action based on a message from userui.
+ *
+ * @n: Action bits requested by userui.
+ *
+ * Only the bits userui is allowed to alter are taken from @n; the remaining
+ * bits are taken from the current toi_action. If, afterwards, neither pausing
+ * nor single stepping is enabled, anyone waiting for a key is woken.
+ */
+static void ui_nl_set_state(int n)
+{
+	/* The settings userui may alter */
+	static const u32 changeable =
+		(1 << TOI_REBOOT) |
+		(1 << TOI_PAUSE) |
+		(1 << TOI_LOGALL) |
+		(1 << TOI_SINGLESTEP) |
+		(1 << TOI_PAUSE_NEAR_PAGESET_END);
+
+	toi_bkd.toi_action = (toi_bkd.toi_action & (~changeable)) |
+		(n & changeable);
+
+	if (test_action_state(TOI_PAUSE) || test_action_state(TOI_SINGLESTEP))
+		return;
+
+	wake_up_interruptible(&userui_wait_for_key);
+}
+
+/**
+ * userui_post_atomic_restore - Tell userui that atomic restore just happened.
+ *
+ * Sends an empty USERUI_MSG_POST_ATOMIC_RESTORE message so that userui can do
+ * things like redrawing the screen, re-getting settings and so on.
+ */
+static void userui_post_atomic_restore(void)
+{
+	struct user_helper_data *helper = &ui_helper_data;
+
+	toi_send_netlink_message(helper, USERUI_MSG_POST_ATOMIC_RESTORE,
+				 NULL, 0);
+}
+
+/**
+ * userui_storage_needed - Report how much memory in image header is needed.
+ *
+ * Returns the size of the progress granularity (an int) plus the size of the
+ * helper program path buffer plus one extra byte.
+ */
+static int userui_storage_needed(void)
+{
+	return sizeof(int) + sizeof(ui_helper_data.program) + 1;
+}
+
+/**
+ * userui_save_config_info - Fill buffer with config info for image header.
+ *
+ * @buf: Buffer into which to put the config info we want to save.
+ *
+ * Layout: progress_granularity (int), the helper program path buffer, then
+ * one terminating NUL byte. Returns the number of bytes stored, which matches
+ * what userui_storage_needed() reports.
+ */
+static int userui_save_config_info(char *buf)
+{
+	char *pos = buf;
+
+	*((int *) pos) = progress_granularity;
+	pos += sizeof(int);
+
+	memcpy(pos, ui_helper_data.program, sizeof(ui_helper_data.program));
+	pos += sizeof(ui_helper_data.program);
+
+	/*
+	 * The size we report includes one trailing byte. Give it a defined
+	 * value rather than saving whatever happened to be in the buffer.
+	 */
+	*pos++ = '\0';
+
+	return pos - buf;
+}
+
+/**
+ * userui_load_config_info - Restore config info from buffer.
+ *
+ * @buf: Buffer containing header info loaded.
+ * @size: Size of data loaded for this module.
+ *
+ * Restores progress_granularity and, unless the helper path has already been
+ * set during this boot, the helper program path. Data too short to contain
+ * the granularity is ignored.
+ */
+static void userui_load_config_info(char *buf, int size)
+{
+	/*
+	 * Without this check a short (or negative) size would be converted
+	 * to a huge unsigned value below and we would read past the data.
+	 */
+	if (size < (int) sizeof(int))
+		return;
+
+	progress_granularity = *((int *) buf);
+	size -= sizeof(int);
+
+	/* Don't load the saved path if one has already been set */
+	if (ui_helper_changed)
+		return;
+
+	if (size > (int) sizeof(ui_helper_data.program))
+		size = sizeof(ui_helper_data.program);
+
+	memcpy(ui_helper_data.program, buf + sizeof(int), size);
+	ui_helper_data.program[sizeof(ui_helper_data.program) - 1] = '\0';
+}
+
+/**
+ * set_ui_program_set - Record that the userui program was changed.
+ *
+ * Side effect routine for the "program" sysfs entry. In an initrd or ramfs,
+ * the user may set a location for the userui program. Once that has happened
+ * the path saved in the image header must not replace it, so raise the flag
+ * that userui_load_config_info() checks.
+ */
+static void set_ui_program_set(void)
+{
+	ui_helper_changed = 1;	/* keep the path the user just set */
+}
+
+/**
+ * userui_memory_needed - Tell core how much memory to reserve for us.
+ *
+ * Returns a ball park figure: the size in bytes of 128 pages.
+ */
+static int userui_memory_needed(void)
+{
+	return PAGE_SIZE * 128;
+}
+
+/**
+ * userui_update_status - Update the progress bar and (if on) in-bar message.
+ *
+ * @value: Current progress percentage numerator.
+ * @maximum: Current progress percentage denominator.
+ * @fmt: Message to be displayed in the middle of the progress bar.
+ *
+ * Note that a NULL message does not mean that any previous message is erased!
+ * For that, you need toi_prepare_status with clearbar on.
+ *
+ * Returns a u32, being the next numerator (as determined by the maximum and
+ * progress granularity) where status needs to be updated. This is to reduce
+ * unnecessary calls to update_status. Returns 0 when no helper is running
+ * and @maximum when either @maximum or the granularity is zero.
+ */
+static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...)
+{
+	static u32 last_step = 9999;
+	struct userui_msg_params msg;
+	u32 this_step, next_update;
+	u32 scaled_value, scaled_maximum;
+	int bitshift;
+
+	if (ui_helper_data.pid == -1)
+		return 0;
+
+	if ((!maximum) || (!progress_granularity))
+		return maximum;
+
+	/* value is unsigned, so only the upper bound needs clamping */
+	if (value > maximum)
+		value = maximum;
+
+	/*
+	 * Try to avoid math problems - we can't do 64 bit math here
+	 * (and shouldn't need it - anyone got screen resolution
+	 * of 65536 pixels or more?). Scale both numbers down so that
+	 * maximum fits in 16 bits; a shift of zero leaves them untouched.
+	 */
+	bitshift = fls(maximum) - 16;
+	if (bitshift < 0)
+		bitshift = 0;
+
+	scaled_maximum = maximum >> bitshift;
+	scaled_value = value >> bitshift;
+
+	this_step = (u32) (scaled_value * progress_granularity /
+			   scaled_maximum);
+	next_update = (((this_step + 1) * scaled_maximum /
+			progress_granularity) + 1) << bitshift;
+
+	/* Nothing new to show until we reach the next step */
+	if (this_step == last_step)
+		return next_update;
+
+	memset(&msg, 0, sizeof(msg));
+
+	msg.a = this_step;
+	msg.b = progress_granularity;
+
+	if (fmt) {
+		va_list args;
+		va_start(args, fmt);
+		vsnprintf(msg.text, sizeof(msg.text), fmt, args);
+		va_end(args);
+		msg.text[sizeof(msg.text)-1] = '\0';
+	}
+
+	toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS,
+			&msg, sizeof(msg));
+	last_step = this_step;
+
+	return next_update;
+}
+
+/**
+ * userui_message - Display a message without necessarily logging it.
+ *
+ * @section: Type of message. Messages can be filtered by type.
+ * @level: Degree of importance of the message. Lower values = higher priority.
+ * @normally_logged: Whether logged even if log_everything is off.
+ * @fmt: Message (and parameters).
+ *
+ * This function is intended to do the same job as printk, but without normally
+ * logging what is printed. The point is to be able to get debugging info on
+ * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M"
+ *
+ * A non-zero @level above the current console log level is dropped. The text
+ * is also sent to printk when TOI_LOGALL is set.
+ *
+ * It may be called from an interrupt context - can't sleep!
+ */
+static void userui_message(u32 section, u32 level, u32 normally_logged,
+		const char *fmt, ...)
+{
+	struct userui_msg_params params;
+	va_list ap;
+
+	/* Level zero is always shown; anything else is filtered */
+	if (level != 0 && level > console_loglevel)
+		return;
+
+	/* Zero everything so that an absent fmt yields an empty string */
+	memset(&params, 0, sizeof(params));
+	params.a = section;
+	params.b = level;
+	params.c = normally_logged;
+
+	if (fmt) {
+		va_start(ap, fmt);
+		vsnprintf(params.text, sizeof(params.text), fmt, ap);
+		va_end(ap);
+		params.text[sizeof(params.text) - 1] = '\0';
+	}
+
+	if (test_action_state(TOI_LOGALL))
+		printk(KERN_INFO "%s\n", params.text);
+
+	toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE,
+				 &params, sizeof(params));
+}
+
+/**
+ * wait_for_key_via_userui - Wait for userui to receive a keypress.
+ *
+ * Sleeps interruptibly on userui_wait_for_key until woken (userui reporting
+ * a key, an abort request, or pausing being switched off) or until a signal
+ * is pending.
+ */
+static void wait_for_key_via_userui(void)
+{
+	DECLARE_WAITQUEUE(wait, current);
+
+	add_wait_queue(&userui_wait_for_key, &wait);
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	/*
+	 * We are already queued and marked as sleeping, so simply give up the
+	 * CPU. Calling interruptible_sleep_on() here would queue this task on
+	 * the same wait queue a second time.
+	 */
+	schedule();
+
+	set_current_state(TASK_RUNNING);
+	remove_wait_queue(&userui_wait_for_key, &wait);
+}
+
+/**
+ * userui_prepare_status - Display high level messages.
+ *
+ * @clearbar: Whether to clear the progress bar.
+ * @fmt...: New message for the title. If NULL, the previous title is reused.
+ *
+ * Prepare the 'nice display', drawing the header and version, along with the
+ * current action and perhaps also resetting the progress bar. Without a
+ * running helper the title goes to printk instead.
+ */
+static void userui_prepare_status(int clearbar, const char *fmt, ...)
+{
+	va_list args;
+
+	if (fmt) {
+		va_start(args, fmt);
+		lastheader_message_len = vsnprintf(lastheader,
+				sizeof(lastheader), fmt, args);
+		va_end(args);
+	}
+
+	if (clearbar)
+		toi_update_status(0, 1, NULL);
+
+	if (ui_helper_data.pid == -1)
+		printk(KERN_EMERG "%s\n", lastheader);
+	else
+		/*
+		 * lastheader is already formatted text and may contain '%',
+		 * so it must never be used as the format string itself.
+		 */
+		toi_message(0, TOI_STATUS, 1, "%s", lastheader);
+}
+
+/**
+ * userui_wait_for_keypress - Wait for keypress via userui.
+ *
+ * @timeout: Maximum time to wait (currently ignored).
+ *
+ * Wait for a keypress from userui. Returns a space once woken, or '\0'
+ * straight away when no helper is running.
+ *
+ * FIXME: Implement timeout?
+ */
+static char userui_wait_for_keypress(int timeout)
+{
+	/* No helper, so nobody could ever report a key */
+	if (ui_helper_data.pid == -1)
+		return '\0';
+
+	wait_for_key_via_userui();
+	return ' ';
+}
+
+/**
+ * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it.
+ *
+ * @result_code: Reason why we're aborting (1 << bit).
+ * @fmt: Message to display if telling the user what's going on.
+ *
+ * Abort a cycle. If this wasn't at the user's request (and we're displaying
+ * output), tell the user why and wait for them to acknowledge the message.
+ * Does nothing beyond recording @result_code if the cycle was already
+ * aborted.
+ */
+static void userui_abort_hibernate(int result_code, const char *fmt, ...)
+{
+	va_list args;
+	int printed_len;
+
+	set_result_state(result_code);
+
+	if (test_result_state(TOI_ABORTED))
+		return;
+
+	set_result_state(TOI_ABORTED);
+
+	if (test_result_state(TOI_ABORT_REQUESTED))
+		return;
+
+	va_start(args, fmt);
+	printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf),
+			fmt, args);
+	va_end(args);
+
+	/*
+	 * vsnprintf reports the length the full message would have had, which
+	 * can be beyond the end of the buffer. Clamp it to what was stored
+	 * before using it as an offset.
+	 */
+	if (printed_len < 0)
+		printed_len = 0;
+	else if (printed_len >= (int) sizeof(local_printf_buf))
+		printed_len = sizeof(local_printf_buf) - 1;
+
+	if (ui_helper_data.pid != -1)
+		snprintf(local_printf_buf + printed_len,
+			 sizeof(local_printf_buf) - printed_len,
+			 " (Press SPACE to continue)");
+
+	toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf);
+
+	if (ui_helper_data.pid != -1)
+		userui_wait_for_keypress(0);
+}
+
+/*
+ * Stop a resume in progress: ask for I/O to stop, wait until it has, let the
+ * active allocator note the attempt (if it wants to know) and power down.
+ */
+static void abort_resume_and_power_down(void)
+{
+	toi_prepare_status(CLEAR_BAR, "Escape pressed. Powering down again.");
+	set_toi_state(TOI_STOP_RESUME);
+
+	while (!test_toi_state(TOI_IO_STOPPED))
+		schedule();
+
+	if (toiActiveAllocator->mark_resume_attempted)
+		toiActiveAllocator->mark_resume_attempted(0);
+
+	toi_power_down();
+}
+
+/**
+ * request_abort_hibernate - Abort hibernating or resuming at user request.
+ *
+ * Handle the user requesting the cancellation of a hibernation or resume by
+ * pressing escape. A repeated request is ignored.
+ */
+static void request_abort_hibernate(void)
+{
+	if (test_result_state(TOI_ABORT_REQUESTED))
+		return;
+
+	if (test_toi_state(TOI_NOW_RESUMING))
+		abort_resume_and_power_down();
+
+	toi_prepare_status(CLEAR_BAR,
+			"--- ESCAPE PRESSED : ABORTING HIBERNATION ---");
+	set_abort_result(TOI_ABORT_REQUESTED);
+	wake_up_interruptible(&userui_wait_for_key);
+}
+
+/**
+ * userui_user_rcv_msg - Receive a netlink message from userui.
+ *
+ * @skb: skb received.
+ * @nlh: Netlink header received.
+ *
+ * Returns 0 when the message was handled (or is a control message), 1 when
+ * the type is in range but not handled here, -EINVAL for an unknown type or
+ * a badly sized payload, -EPERM when the sender lacks CAP_NET_ADMIN and
+ * -EBUSY for a NOFREEZE_ME request while a helper is already registered.
+ */
+static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+	int type;
+	int *data;
+	int payload_len;
+
+	type = nlh->nlmsg_type;
+
+	/* A control message: ignore them */
+	if (type < NETLINK_MSG_BASE)
+		return 0;
+
+	/* Unknown message: reply with EINVAL */
+	if (type >= USERUI_MSG_MAX)
+		return -EINVAL;
+
+	/* All operations require privileges, even GET */
+	if (security_netlink_recv(skb, CAP_NET_ADMIN))
+		return -EPERM;
+
+	/* Only allow one task to receive NOFREEZE privileges */
+	if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) {
+		printk(KERN_INFO "Got NOFREEZE_ME request when "
+			"ui_helper_data.pid is %d.\n", ui_helper_data.pid);
+		return -EBUSY;
+	}
+
+	data = (int *) NLMSG_DATA(nlh);
+
+	switch (type) {
+	case USERUI_MSG_ABORT:
+		request_abort_hibernate();
+		return 0;
+	case USERUI_MSG_GET_STATE:
+		toi_send_netlink_message(&ui_helper_data,
+				USERUI_MSG_GET_STATE, &toi_bkd.toi_action,
+				sizeof(toi_bkd.toi_action));
+		return 0;
+	case USERUI_MSG_GET_DEBUG_STATE:
+		toi_send_netlink_message(&ui_helper_data,
+				USERUI_MSG_GET_DEBUG_STATE,
+				&toi_bkd.toi_debug_state,
+				sizeof(toi_bkd.toi_debug_state));
+		return 0;
+	case USERUI_MSG_SET_STATE:
+		if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+			return -EINVAL;
+		ui_nl_set_state(*data);
+		return 0;
+	case USERUI_MSG_SET_DEBUG_STATE:
+		if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+			return -EINVAL;
+		toi_bkd.toi_debug_state = (*data);
+		return 0;
+	case USERUI_MSG_SPACE:
+		wake_up_interruptible(&userui_wait_for_key);
+		return 0;
+	case USERUI_MSG_GET_POWERDOWN_METHOD:
+		toi_send_netlink_message(&ui_helper_data,
+				USERUI_MSG_GET_POWERDOWN_METHOD,
+				&toi_poweroff_method,
+				sizeof(toi_poweroff_method));
+		return 0;
+	case USERUI_MSG_SET_POWERDOWN_METHOD:
+		if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char)))
+			return -EINVAL;
+		/*
+		 * The payload was just checked to be a single byte, so read
+		 * exactly one byte rather than a whole int.
+		 */
+		toi_poweroff_method =
+			(unsigned long) (*((unsigned char *) data));
+		return 0;
+	case USERUI_MSG_GET_LOGLEVEL:
+		toi_send_netlink_message(&ui_helper_data,
+				USERUI_MSG_GET_LOGLEVEL,
+				&toi_bkd.toi_default_console_level,
+				sizeof(toi_bkd.toi_default_console_level));
+		return 0;
+	case USERUI_MSG_SET_LOGLEVEL:
+		if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int)))
+			return -EINVAL;
+		toi_bkd.toi_default_console_level = (*data);
+		return 0;
+	case USERUI_MSG_PRINTK:
+		/*
+		 * Userspace need not have NUL terminated the text, so never
+		 * print more than the payload actually holds.
+		 */
+		payload_len = (int) NLMSG_PAYLOAD(nlh, 0);
+		if (payload_len > 0)
+			printk(KERN_INFO "%.*s", payload_len, (char *) data);
+		return 0;
+	}
+
+	/* Unhandled here */
+	return 1;
+}
+
+/**
+ * userui_cond_pause - Possibly pause at user request.
+ *
+ * @pause: Whether to pause or just display the message.
+ * @message: Message to display at the start of pausing.
+ *
+ * Potentially pause and wait for the user to tell us to continue. We pause
+ * while a helper is running and either single stepping is on, or pausing is
+ * on and @pause is set. While paused, the user can do things like changing
+ * the loglevel, toggling the display of debugging sections and such like.
+ * We always yield the CPU once before returning.
+ */
+static void userui_cond_pause(int pause, char *message)
+{
+	int prompted = 0;
+	int key = 0;
+
+	for (;;) {
+		/* A space from the user ends the pause */
+		if (key == ' ')
+			break;
+
+		if (ui_helper_data.pid == -1)
+			break;
+
+		if (!(test_action_state(TOI_PAUSE) && pause) &&
+		    !test_action_state(TOI_SINGLESTEP))
+			break;
+
+		/* Only show the prompt the first time around */
+		if (!prompted) {
+			toi_prepare_status(DONT_CLEAR_BAR,
+				"%s Press SPACE to continue.%s",
+				message ? message : "",
+				(test_action_state(TOI_SINGLESTEP)) ?
+				" Single step on." : "");
+			prompted = 1;
+		}
+
+		key = userui_wait_for_keypress(0);
+	}
+
+	schedule();
+}
+
+/**
+ * userui_prepare_console - Prepare the console for use.
+ *
+ * Save the current kmsg_redirect setting, point kernel messages at the
+ * foreground console and, if userui is enabled and a program is configured,
+ * attempt to start it. Console loglevel changes are handled by userui.
+ */
+static void userui_prepare_console(void)
+{
+	orig_kmsg = kmsg_redirect;
+	kmsg_redirect = fg_console + 1;
+
+	/* No helper yet */
+	ui_helper_data.pid = -1;
+
+	if (!userui_ops.enabled) {
+		printk(KERN_INFO "TuxOnIce: Userui disabled.\n");
+		return;
+	}
+
+	if (!*ui_helper_data.program) {
+		printk(KERN_INFO "TuxOnIce: Userui program not configured.\n");
+		return;
+	}
+
+	toi_netlink_setup(&ui_helper_data);
+}
+
+/**
+ * userui_cleanup_console - Cleanup after a cycle.
+ *
+ * Close the netlink connection if a helper is running, and put kmsg_redirect
+ * back to the value saved by userui_prepare_console().
+ */
+static void userui_cleanup_console(void)
+{
+	int helper_running = (ui_helper_data.pid > -1);
+
+	if (helper_running)
+		toi_netlink_close(&ui_helper_data);
+
+	kmsg_redirect = orig_kmsg;
+}
+
+/*
+ * User interface specific /sys/power/tuxonice entries.
+ *
+ * The table is empty unless both CONFIG_NET and CONFIG_SYSFS are defined.
+ * Writing the "program" entry invokes set_ui_program_set(), which stops the
+ * path saved in an image header from replacing the one just written.
+ */
+
+static struct toi_sysfs_data sysfs_params[] = {
+#if defined(CONFIG_NET) && defined(CONFIG_SYSFS)
+ SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_CAN_CANCEL, 0),
+ SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action,
+ TOI_PAUSE, 0),
+ SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL),
+ SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1,
+ 2048, 0, NULL),
+ SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0,
+ set_ui_program_set),
+ SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL)
+#endif
+};
+
+/* How the userui module presents itself to the TuxOnIce core. */
+static struct toi_module_ops userui_ops = {
+	/* Identity */
+	.name			= "userui",
+	.type			= MISC_MODULE,
+	.module			= THIS_MODULE,
+	.shared_directory	= "user_interface",
+
+	/* Image header and memory accounting */
+	.memory_needed		= userui_memory_needed,
+	.storage_needed		= userui_storage_needed,
+	.save_config_info	= userui_save_config_info,
+	.load_config_info	= userui_load_config_info,
+
+	/* sysfs entries */
+	.sysfs_data		= sysfs_params,
+	.num_sysfs_entries	= sizeof(sysfs_params) /
+					sizeof(sysfs_params[0]),
+};
+
+/* The user interface operations implemented by this file. */
+static struct ui_ops my_ui_ops = {
+	/* Console setup and teardown */
+	.prepare		= userui_prepare_console,
+	.cleanup		= userui_cleanup_console,
+	.post_atomic_restore	= userui_post_atomic_restore,
+
+	/* Output */
+	.prepare_status		= userui_prepare_status,
+	.update_status		= userui_update_status,
+	.message		= userui_message,
+
+	/* User interaction */
+	.wait_for_key		= userui_wait_for_keypress,
+	.cond_pause		= userui_cond_pause,
+	.abort			= userui_abort_hibernate,
+};
+
+/**
+ * toi_user_ui_init - Boot time initialisation for user interface.
+ *
+ * Fills in the helper description, then registers the module and the ui
+ * operations. Returns 0 on success or the error from whichever registration
+ * failed.
+ *
+ * Invoked from the core init routine.
+ */
+static __init int toi_user_ui_init(void)
+{
+	int result;
+
+	ui_helper_data.nl = NULL;
+	strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255);
+	ui_helper_data.pid = -1;
+	ui_helper_data.skb_size = sizeof(struct userui_msg_params);
+	ui_helper_data.pool_limit = 6;
+	ui_helper_data.netlink_id = NETLINK_TOI_USERUI;
+	ui_helper_data.name = "userspace ui";
+	ui_helper_data.rcv_msg = userui_user_rcv_msg;
+	ui_helper_data.interface_version = 8;
+	ui_helper_data.must_init = 0;
+	ui_helper_data.not_ready = userui_cleanup_console;
+	init_completion(&ui_helper_data.wait_for_process);
+
+	result = toi_register_module(&userui_ops);
+	if (result)
+		/* Nothing was registered, so there is nothing to undo */
+		return result;
+
+	result = toi_register_ui_ops(&my_ui_ops);
+	if (result)
+		toi_unregister_module(&userui_ops);
+
+	return result;
+}
+
+#ifdef MODULE
+/**
+ * toi_user_ui_exit - Cleanup code for if the core is unloaded.
+ *
+ * Closes the netlink connection to the helper, then removes the ui
+ * operations and the module registered by toi_user_ui_init().
+ */
+static __exit void toi_user_ui_exit(void)
+{
+ toi_netlink_close_complete(&ui_helper_data);
+ toi_remove_ui_ops(&my_ui_ops);
+ toi_unregister_module(&userui_ops);
+}
+
+module_init(toi_user_ui_init);
+module_exit(toi_user_ui_exit);
+MODULE_AUTHOR("Nigel Cunningham");
+MODULE_DESCRIPTION("TuxOnIce Userui Support");
+MODULE_LICENSE("GPL");
+#else
+/* Built in: initialise via a late initcall instead of module_init(). */
+late_initcall(toi_user_ui_init);
+#endif
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
include/linux/swap.h | 1 +
mm/page_alloc.c | 28 ++++++++++++++++++++++++++++
2 files changed, 29 insertions(+), 0 deletions(-)
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 62d8143..085cc14 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -168,6 +168,7 @@ struct swap_list_t {
extern unsigned long totalram_pages;
extern unsigned long totalreserve_pages;
extern unsigned int nr_free_buffer_pages(void);
+extern unsigned int nr_unallocated_buffer_pages(void);
extern unsigned int nr_free_pagecache_pages(void);
/* Definition of global_page_state not available yet */
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e2f2699..95e753f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1810,6 +1810,26 @@ static unsigned int nr_free_zone_pages(int offset)
return sum;
}
+/*
+ * Sum, over the zones of the current node's GFP_KERNEL zonelist, the number
+ * of free pages each zone has in excess of its pages_high watermark.
+ * @offset is handed to for_each_zone_zonelist() as the zone index limit.
+ */
+static unsigned int nr_unallocated_zone_pages(int offset)
+{
+	struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
+	unsigned int total = 0;
+	struct zone *zone;
+	struct zoneref *z;
+
+	/* Just pick one node, since fallback list is circular */
+	for_each_zone_zonelist(zone, z, zonelist, offset) {
+		unsigned long free = zone_page_state(zone, NR_FREE_PAGES);
+		unsigned long watermark = zone->pages_high;
+
+		/* Zones at or below the watermark contribute nothing. */
+		if (free <= watermark)
+			continue;
+
+		total += free - watermark;
+	}
+
+	return total;
+}
+
/*
* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
*/
@@ -1820,6 +1840,14 @@ unsigned int nr_free_buffer_pages(void)
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
/*
+ * Amount of free RAM above the high watermark within ZONE_DMA and ZONE_NORMAL
+ */
+unsigned int nr_unallocated_buffer_pages(void)
+{
+ return nr_unallocated_zone_pages(gfp_zone(GFP_USER));
+}
+
+/*
* Amount of free RAM allocatable within all zones
*/
unsigned int nr_free_pagecache_pages(void)
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/Kconfig | 9 +
kernel/power/Makefile | 2 +
kernel/power/tuxonice_file.c | 1235 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 1246 insertions(+), 0 deletions(-)
create mode 100644 kernel/power/tuxonice_file.c
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 1b474a6..51d57d4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -192,6 +192,15 @@ menuconfig TOI_CORE
comment "Image Storage (you need at least one allocator)"
depends on TOI_CORE
+ config TOI_FILE
+ bool "File Allocator"
+ depends on TOI_CORE
+ default y
+ ---help---
+ This option enables support for storing an image in a
+ simple file. This should be possible, but we're still
+ testing it.
+
comment "General Options"
depends on TOI_CORE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 3803866..41daada 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -17,6 +17,8 @@ endif
obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
+obj-$(CONFIG_TOI_FILE) += tuxonice_block_io.o tuxonice_file.o
+
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
obj-$(CONFIG_FREEZER) += process.o
diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c
new file mode 100644
index 0000000..5ceb20a
--- /dev/null
+++ b/kernel/power/tuxonice_file.c
@@ -0,0 +1,1235 @@
+/*
+ * kernel/power/tuxonice_file.c
+ *
+ * Copyright (C) 2005-2008 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file encapsulates functions for usage of a simple file as a
+ * backing store. It is based upon the swapallocator, and shares the
+ * same basic working. Here, though, we have nothing to do with
+ * swapspace, and only one device to worry about.
+ *
+ * The user can just
+ *
+ * echo TuxOnIce > /path/to/my_file
+ *
+ * dd if=/dev/zero bs=1M count=<file_size_desired> >> /path/to/my_file
+ *
+ * and
+ *
+ * echo /path/to/my_file > /sys/power/tuxonice/file/target
+ *
+ * then put what they find in /sys/power/tuxonice/resume
+ * as their resume= parameter in lilo.conf (and rerun lilo if using it).
+ *
+ * Having done this, they're ready to hibernate and resume.
+ *
+ * TODO:
+ * - File resizing.
+ */
+
+#include <linux/suspend.h>
+#include <linux/blkdev.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/mount.h>
+#include <linux/statfs.h>
+#include <linux/syscalls.h>
+#include <linux/namei.h>
+#include <linux/fs.h>
+#include <linux/root_dev.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_io.h"
+#include "tuxonice_storage.h"
+#include "tuxonice_block_io.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+
+static struct toi_module_ops toi_fileops;
+
+/* Details of our target. */
+
+static char toi_file_target[256];
+static struct inode *target_inode;
+static struct file *target_file;
+static struct block_device *toi_file_target_bdev;
+static dev_t resume_file_dev_t;
+static int used_devt;
+static int setting_toi_file_target;
+static sector_t target_firstblock, target_header_start;
+static int target_storage_available;
+static int target_claim;
+
+/* Old signatures */
+static char HaveImage[] = "HaveImage\n";
+static char NoImage[] = "TuxOnIce\n";
+#define sig_size (sizeof(HaveImage) + 1)
+
+struct toi_file_header {
+ char sig[sig_size];
+ int resumed_before;
+ unsigned long first_header_block;
+ int have_image;
+};
+
+/* Header Page Information */
+static int header_pages_reserved;
+
+/* Main Storage Pages */
+static int main_pages_allocated, main_pages_requested;
+
+#define target_is_normal_file() (S_ISREG(target_inode->i_mode))
+
+static struct toi_bdev_info devinfo;
+
+/* Extent chain for blocks */
+static struct hibernate_extent_chain block_chain;
+
+/* Signature operations */
+enum {
+ GET_IMAGE_EXISTS,
+ INVALIDATE,
+ MARK_RESUME_ATTEMPTED,
+ UNMARK_RESUME_ATTEMPTED,
+};
+
+/**
+ * set_devinfo - populate device information
+ * @bdev: Block device on which the file is.
+ * @target_blkbits: log2 of the filesystem block size of the target
+ * file inode, or 0 to reset the shift and blocks-per-page values.
+ *
+ * Populate the devinfo structure about the target device.
+ *
+ * Background: a sector represents a fixed amount of data (generally 512 bytes).
+ * The hard drive sector size and the filesystem block size may be different.
+ * If fs_blkbits is log2 of the filesystem block size and hd_blkbits is log2 of
+ * the hard drive sector size:
+ *
+ * fs_block << (fs_blkbits - hd_blkbits) converts fs block into hd sector number
+ * sector >> (fs_blkbits - hd_blkbits) converts hd sector into fs block
+ *
+ * Here target_blkbits == fs_blkbits and hd_blkbits == 9, hence:
+ *
+ * (fs_blkbits - hd_blkbits) == devinfo.bmap_shift
+ *
+ * The memory page size is defined by PAGE_SHIFT. devinfo.blocks_per_page is the
+ * number of filesystem blocks per memory page.
+ *
+ * Note that blocks are stored after >>. They are used after being <<.
+ * We always only use PAGE_SIZE aligned blocks.
+ *
+ * Side effects:
+ * devinfo.bdev, devinfo.bmap_shift and devinfo.blocks_per_page are set.
+ */
+static void set_devinfo(struct block_device *bdev, int target_blkbits)
+{
+ devinfo.bdev = bdev;
+ if (!target_blkbits) {
+ devinfo.bmap_shift = 0;
+ devinfo.blocks_per_page = 0;
+ } else {
+ /* We are assuming a hard disk with 512 (2^9) bytes/sector */
+ devinfo.bmap_shift = target_blkbits - 9;
+ devinfo.blocks_per_page = (1 << (PAGE_SHIFT - target_blkbits));
+ }
+}
+
+/*
+ * raw_to_real - discount per-page bookkeeping from a raw page count
+ * @raw: Number of pages of raw storage.
+ *
+ * Every page is charged sizeof(unsigned long) + sizeof(int) bytes of
+ * overhead (the same per-page figure toi_file_allocate_storage() uses when
+ * it computes its extra pages). A rounded-up estimate of the pages that
+ * overhead occupies is subtracted from @raw.
+ *
+ * Returns the remaining page count, clamped so it is never negative.
+ */
+static long raw_to_real(long raw)
+{
+	const unsigned long per_page = sizeof(unsigned long) + sizeof(int);
+	long real;
+
+	real = raw - (raw * per_page + (PAGE_SIZE + per_page + 1)) /
+		(PAGE_SIZE + per_page);
+
+	if (real < 0)
+		return 0;
+
+	return real;
+}
+
+/*
+ * toi_file_storage_available - pages of storage the target can provide
+ *
+ * Returns 0 when no target inode is set, -1 for sockets, character devices
+ * and FIFOs, and otherwise the raw page count (precomputed for regular
+ * files, derived from the partition or disk size for block devices) after
+ * raw_to_real() has removed the bookkeeping overhead.
+ */
+static int toi_file_storage_available(void)
+{
+	int pages = 0;
+
+	if (!target_inode)
+		return 0;
+
+	/* Socket, Char, Fifo */
+	if (S_ISSOCK(target_inode->i_mode) || S_ISCHR(target_inode->i_mode) ||
+	    S_ISFIFO(target_inode->i_mode))
+		return -1;
+
+	if (S_ISREG(target_inode->i_mode)) {
+		/* Regular file: current size - holes + free space on part */
+		pages = target_storage_available;
+	} else if (S_ISBLK(target_inode->i_mode)) {
+		/* Block device */
+		struct block_device *bdev = toi_file_target_bdev;
+		sector_t sectors;
+
+		if (!bdev->bd_disk) {
+			printk(KERN_INFO "bdev->bd_disk null.\n");
+			return 0;
+		}
+
+		if (bdev->bd_part)
+			sectors = bdev->bd_part->nr_sects;
+		else
+			sectors = get_capacity(bdev->bd_disk);
+
+		/* 512-byte sectors -> pages */
+		pages = sectors >> (PAGE_SHIFT - 9);
+	}
+
+	return raw_to_real(pages);
+}
+
+/*
+ * has_contiguous_blocks - can this page of the file be used for I/O?
+ * @page_num: Index of the page within the target file.
+ *
+ * Returns 1 if every filesystem block backing the page is mapped (bmap()
+ * non-zero) and each block immediately follows the previous one, else 0.
+ */
+static int has_contiguous_blocks(int page_num)
+{
+	sector_t prev = 0;
+	int idx;
+
+	for (idx = 0; idx < devinfo.blocks_per_page; idx++) {
+		sector_t cur = bmap(target_inode,
+				page_num * devinfo.blocks_per_page + idx);
+
+		/* A hole in the file. */
+		if (!cur)
+			return 0;
+
+		/* Not adjacent to the previous block. */
+		if (prev && cur != prev + 1)
+			return 0;
+
+		prev = cur;
+	}
+
+	return 1;
+}
+
+/*
+ * size_ignoring_ignored_pages - count the usable pages of the target
+ *
+ * For anything other than a regular file this defers to
+ * toi_file_storage_available(). For a regular file it counts the whole
+ * pages (i_size >> PAGE_SHIFT) whose blocks are mapped contiguously.
+ */
+static int size_ignoring_ignored_pages(void)
+{
+	int page, usable = 0;
+
+	if (!target_is_normal_file())
+		return toi_file_storage_available();
+
+	for (page = 0; page < (target_inode->i_size >> PAGE_SHIFT); page++)
+		usable += has_contiguous_blocks(page) ? 1 : 0;
+
+	return usable;
+}
+
+/**
+ * __populate_block_list - add an extent to the chain
+ * @min: First block of the extent.
+ * @max: Last block of the extent.
+ *
+ * If TOI_TEST_BIO is set, print a debug message giving the extent's range
+ * after shifting by devinfo.bmap_shift.
+ *
+ * Returns the result of toi_add_to_extent_chain().
+ **/
+static int __populate_block_list(int min, int max)
+{
+	if (test_action_state(TOI_TEST_BIO)) {
+		int first = min << devinfo.bmap_shift;
+		int last = ((max + 1) << devinfo.bmap_shift) - 1;
+
+		printk(KERN_INFO "Adding extent %d-%d.\n", first, last);
+	}
+
+	return toi_add_to_extent_chain(&block_chain, min, max);
+}
+
+/*
+ * apply_header_reservation - skip over the pages reserved for the header
+ *
+ * Rewinds the writer position to the start of the extent chain, advances it
+ * by header_pages_reserved pages and records the resulting position in
+ * toi_writer_posn_save[2].
+ *
+ * Returns 0 on success or -ENOSPC if the position could not be advanced.
+ */
+static int apply_header_reservation(void)
+{
+	int remaining = header_pages_reserved;
+
+	/* Apply header space reservation */
+	toi_extent_state_goto_start(&toi_writer_posn);
+
+	while (remaining > 0) {
+		if (toi_bio_ops.forward_one_page(1, 0))
+			return -ENOSPC;
+		remaining--;
+	}
+
+	/* The end of header pages will be the start of pageset 2 */
+	toi_extent_state_save(&toi_writer_posn, &toi_writer_posn_save[2]);
+
+	return 0;
+}
+
+/*
+ * populate_block_list - (re)build the extent chain for the target
+ *
+ * Any existing chain is dropped first. For a non-regular target a single
+ * extent is added, starting one page in and covering
+ * target_storage_available pages. For a regular file, every page whose
+ * blocks are contiguous is added, merging adjacent pages into one extent;
+ * the page holding the signature (target_firstblock) is left out.
+ *
+ * Returns 0 on success, the error from adding an extent, or -ENOSPC if the
+ * header reservation cannot be applied.
+ *
+ * NOTE(review): extents are tracked in ints while bmap() returns sector_t,
+ * so very large block numbers would be truncated - confirm whether the
+ * extent chain API can take a wider type.
+ */
+static int populate_block_list(void)
+{
+	int i, extent_min = -1, extent_max = -1, result = 0;
+
+	if (block_chain.first)
+		toi_put_extent_chain(&block_chain);
+
+	if (!target_is_normal_file()) {
+		if (target_storage_available > 0) {
+			result = __populate_block_list(devinfo.blocks_per_page,
+					(target_storage_available + 1) *
+					devinfo.blocks_per_page - 1);
+			if (result)
+				return result;
+		}
+		return apply_header_reservation();
+	}
+
+	for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) {
+		sector_t new_sector;
+
+		if (!has_contiguous_blocks(i))
+			continue;
+
+		new_sector = bmap(target_inode, (i * devinfo.blocks_per_page));
+
+		/*
+		 * Ignore the first block in the file.
+		 * It gets the header.
+		 */
+		if (new_sector == target_firstblock >> devinfo.bmap_shift)
+			continue;
+
+		/*
+		 * I'd love to be able to fill in holes and resize
+		 * files, but not yet...
+		 */
+
+		if (new_sector == extent_max + 1) {
+			/* Directly follows the current extent: grow it. */
+			extent_max += devinfo.blocks_per_page;
+			continue;
+		}
+
+		/* Discontiguous: flush the current extent, start a new one. */
+		if (extent_min > -1) {
+			result = __populate_block_list(extent_min, extent_max);
+			if (result)
+				return result;
+		}
+
+		extent_min = new_sector;
+		extent_max = extent_min + devinfo.blocks_per_page - 1;
+	}
+
+	/* Flush the final extent, if there was one. */
+	if (extent_min > -1) {
+		result = __populate_block_list(extent_min, extent_max);
+		if (result)
+			return result;
+	}
+
+	return apply_header_reservation();
+}
+
+/*
+ * toi_file_cleanup - drop our hold on the target
+ * @finishing_cycle: Not used by this implementation.
+ *
+ * Releases the claim on and/or the reference to the target block device
+ * (whichever we took), resets the cached target state and closes the
+ * target file if one is open.
+ */
+static void toi_file_cleanup(int finishing_cycle)
+{
+	struct block_device *bdev = toi_file_target_bdev;
+
+	if (bdev) {
+		if (target_claim) {
+			bd_release(bdev);
+			target_claim = 0;
+		}
+
+		/* Only put the device if we opened it by device number. */
+		if (used_devt) {
+			blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
+			used_devt = 0;
+		}
+
+		toi_file_target_bdev = NULL;
+		target_inode = NULL;
+		set_devinfo(NULL, 0);
+		target_storage_available = 0;
+	}
+
+	if (target_file && !IS_ERR(target_file))
+		filp_close(target_file, NULL);
+
+	target_file = NULL;
+}
+
+/**
+ * reopen_resume_devt - reopen the resume device and reset the devinfo struct
+ *
+ * Having opened resume= once, we remember the major and
+ * minor nodes and use them to reopen the bdev for checking
+ * whether an image exists (possibly when starting a resume).
+ *
+ * On failure toi_file_target_bdev is left NULL rather than holding an
+ * error pointer, because other code in this file tests it with a plain
+ * NULL check.
+ *
+ * NOTE(review): used_devt is not set here, so toi_file_cleanup() will not
+ * blkdev_put() the device opened by this function - confirm whether the
+ * reference is meant to be dropped elsewhere.
+ **/
+static void reopen_resume_devt(void)
+{
+	toi_file_target_bdev = toi_open_by_devnum(resume_file_dev_t,
+			FMODE_READ | FMODE_NDELAY);
+	if (IS_ERR(toi_file_target_bdev)) {
+		printk(KERN_INFO "Got a dev_num (%lx) but failed to open it.\n",
+				(unsigned long) resume_file_dev_t);
+		toi_file_target_bdev = NULL;
+		return;
+	}
+	target_inode = toi_file_target_bdev->bd_inode;
+	set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
+}
+
+/*
+ * toi_file_get_target_info - open the target and cache details about it
+ * @target: Path of the file or device to use.
+ * @get_size: If non-zero, also compute target_storage_available.
+ * @resume_param: Non-zero when @target comes from the resume= parameter.
+ *
+ * Opens @target with filp_open(). If that fails and @resume_param is set,
+ * falls back to resolving @target to a device number and opening the block
+ * device directly. On success target_inode, toi_file_target_bdev,
+ * resume_file_dev_t and devinfo are filled in; target_firstblock is only
+ * recomputed when @resume_param is zero.
+ *
+ * On failure toi_file_target_bdev is left NULL (never an error pointer).
+ */
+static void toi_file_get_target_info(char *target, int get_size,
+		int resume_param)
+{
+	if (target_file)
+		toi_file_cleanup(0);
+
+	if (!target || !strlen(target))
+		return;
+
+	target_file = filp_open(target, O_RDONLY|O_LARGEFILE, 0);
+
+	if (IS_ERR(target_file) || !target_file) {
+
+		if (!resume_param) {
+			printk(KERN_INFO "Open file %s returned %p.\n",
+					target, target_file);
+			target_file = NULL;
+			return;
+		}
+
+		target_file = NULL;
+		wait_for_device_probe();
+		resume_file_dev_t = name_to_dev_t(target);
+		if (!resume_file_dev_t) {
+			struct kstat stat;
+			int error = vfs_stat(target, &stat);
+			printk(KERN_INFO "Open file %s returned %p and "
+					"name_to_devt failed.\n", target,
+					target_file);
+			if (error)
+				printk(KERN_INFO "Stating the file also failed."
+					" Nothing more we can do.\n");
+			else
+				resume_file_dev_t = stat.rdev;
+			return;
+		}
+
+		toi_file_target_bdev = toi_open_by_devnum(resume_file_dev_t,
+				FMODE_READ | FMODE_NDELAY);
+		if (IS_ERR(toi_file_target_bdev)) {
+			printk(KERN_INFO "Got a dev_num (%lx) but failed to "
+					"open it.\n",
+					(unsigned long) resume_file_dev_t);
+			/* Don't leave an error pointer in the global. */
+			toi_file_target_bdev = NULL;
+			return;
+		}
+		used_devt = 1;
+		target_inode = toi_file_target_bdev->bd_inode;
+	} else
+		target_inode = target_file->f_mapping->host;
+
+	if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) ||
+	    S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) {
+		printk(KERN_INFO "File support works with regular files,"
+				" character files and block devices.\n");
+		goto cleanup;
+	}
+
+	if (!used_devt) {
+		if (S_ISBLK(target_inode->i_mode)) {
+			toi_file_target_bdev = I_BDEV(target_inode);
+			if (!bd_claim(toi_file_target_bdev, &toi_fileops))
+				target_claim = 1;
+		} else
+			toi_file_target_bdev = target_inode->i_sb->s_bdev;
+
+		/*
+		 * A superblock that is not backed by a block device has no
+		 * s_bdev; such a target cannot hold an image.
+		 */
+		if (!toi_file_target_bdev) {
+			printk(KERN_INFO "TuxOnIce: FileAllocator: %s is not "
+					"on a block device.\n", target);
+			goto cleanup;
+		}
+
+		resume_file_dev_t = toi_file_target_bdev->bd_dev;
+	}
+
+	set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
+
+	if (get_size)
+		target_storage_available = size_ignoring_ignored_pages();
+
+	if (!resume_param)
+		target_firstblock = bmap(target_inode, 0) << devinfo.bmap_shift;
+
+	return;
+cleanup:
+	target_inode = NULL;
+	if (target_file) {
+		filp_close(target_file, NULL);
+		target_file = NULL;
+	}
+	set_devinfo(NULL, 0);
+	target_storage_available = 0;
+}
+
+/*
+ * toi_file_noresume_reset - .noresume_reset callback of the file allocator
+ *
+ * Just asks the block I/O layer to clean up its read state.
+ */
+static void toi_file_noresume_reset(void)
+{
+ toi_bio_ops.rw_cleanup(READ);
+}
+
+/**
+ * parse_signature - check if the file is suitable for resuming
+ * @header: Signature page read from the file
+ *
+ * Recognises the two old text signatures as well as the binary
+ * tuxonice_signature.
+ *
+ * Returns 1 if the header describes an image (TOI_RESUMED_BEFORE and
+ * target_header_start are then updated from it), 0 if the signature is
+ * valid but there is no image, and -1 if no known signature is present.
+ **/
+static int parse_signature(struct toi_file_header *header)
+{
+	int old_image_sig, old_empty_sig, new_sig;
+
+	old_image_sig = memcmp(HaveImage, header->sig,
+			sizeof(HaveImage) - 1) == 0;
+	old_empty_sig = memcmp(NoImage, header->sig,
+			sizeof(NoImage) - 1) == 0;
+	new_sig = memcmp(tuxonice_signature, header->sig,
+			sizeof(tuxonice_signature)) == 0;
+
+	/* Valid signature, but no image behind it. */
+	if (old_empty_sig)
+		return 0;
+	if (new_sig && !header->have_image)
+		return 0;
+
+	/* Nothing we recognise. */
+	if (!(old_image_sig || new_sig))
+		return -1;
+
+	if (header->resumed_before)
+		set_toi_state(TOI_RESUMED_BEFORE);
+	else
+		clear_toi_state(TOI_RESUMED_BEFORE);
+
+	target_header_start = header->first_header_block;
+	return 1;
+}
+
+/**
+ * prepare_signature - populate the signature structure
+ * @current_header: Signature structure to populate
+ * @first_header_block: Sector with the header containing the extents
+ *
+ * Marks the header as holding an image that has not been resumed from.
+ * Always returns 0.
+ **/
+static int prepare_signature(struct toi_file_header *current_header,
+		unsigned long first_header_block)
+{
+	struct toi_file_header *hdr = current_header;
+
+	hdr->have_image = 1;
+	hdr->resumed_before = 0;
+	hdr->first_header_block = first_header_block;
+	memcpy(hdr->sig, tuxonice_signature, sizeof(tuxonice_signature));
+
+	return 0;
+}
+
+/*
+ * toi_file_storage_allocated - usable pages currently allocated
+ *
+ * Returns 0 without a target; otherwise raw_to_real() of the available
+ * storage (regular file) or of the pages last requested (anything else).
+ */
+static int toi_file_storage_allocated(void)
+{
+	int raw;
+
+	if (!target_inode)
+		return 0;
+
+	raw = target_is_normal_file() ? target_storage_available :
+		main_pages_requested;
+
+	return (int) raw_to_real(raw);
+}
+
+/**
+ * toi_file_release_storage - deallocate the block chain
+ *
+ * Also zeroes the header reservation and the main page counters.
+ * Always returns 0.
+ **/
+static int toi_file_release_storage(void)
+{
+	toi_put_extent_chain(&block_chain);
+
+	header_pages_reserved = main_pages_allocated = main_pages_requested = 0;
+
+	return 0;
+}
+
+/*
+ * toi_file_reserve_header_space - record how many header pages to reserve
+ * @request: Number of pages to set aside for the header.
+ *
+ * Only stores the value; apply_header_reservation() acts on it.
+ */
+static void toi_file_reserve_header_space(int request)
+{
+ header_pages_reserved = request;
+}
+
+/*
+ * toi_file_allocate_storage - make sure enough blocks are in the chain
+ * @main_space_requested: Pages needed for the main image data.
+ *
+ * The total wanted is the main pages, plus the pages needed to hold
+ * sizeof(unsigned long) + sizeof(int) bytes per main page, plus the
+ * reserved header pages. If the chain already covers that, only the header
+ * reservation is reapplied; otherwise the block list is rebuilt.
+ *
+ * Returns 0 on success, -ENOSPC if the rebuilt chain is still too small, or
+ * the error from populate_block_list().
+ */
+static int toi_file_allocate_storage(int main_space_requested)
+{
+	int overhead_pages = DIV_ROUND_UP(main_space_requested *
+			(sizeof(unsigned long) + sizeof(int)), PAGE_SIZE);
+	int wanted = main_space_requested + overhead_pages +
+		header_pages_reserved;
+	int shortfall = wanted - block_chain.size;
+	int result;
+
+	/* Only release_storage reduces the size */
+	if (shortfall < 1)
+		return apply_header_reservation();
+
+	result = populate_block_list();
+	if (result)
+		return result;
+
+	toi_message(TOI_WRITER, TOI_MEDIUM, 0,
+			"Finished with block_chain.size == %d.\n",
+			block_chain.size);
+
+	if (block_chain.size < wanted) {
+		printk(KERN_INFO "Block chain size (%d) < header pages (%d) + "
+				"extra pages (%d) + main pages (%d) (=%d "
+				"pages).\n",
+				block_chain.size, header_pages_reserved,
+				overhead_pages, main_space_requested,
+				wanted);
+		result = -ENOSPC;
+	}
+
+	/* The counters are updated even when we report -ENOSPC. */
+	main_pages_requested = main_space_requested;
+	main_pages_allocated = main_space_requested + overhead_pages;
+	return result;
+}
+
+/**
+ * toi_file_write_header_init - save the header on the image
+ *
+ * Returns 0 on success or the error from writing a header chunk.
+ **/
+static int toi_file_write_header_init(void)
+{
+	int err;
+
+	toi_bio_ops.rw_init(WRITE, 0);
+	toi_writer_buffer_posn = 0;
+
+	/*
+	 * Info needed to bootstrap goes at the start of the header.
+	 * First we save the basic info needed for reading, including the
+	 * number of header pages. Then we save the structs containing data
+	 * needed for reading the header pages back.
+	 * Note that even if header pages take more than one page, when we
+	 * read back the info, we will have restored the location of the
+	 * next header page by the time we go to use it.
+	 */
+	err = toi_bio_ops.rw_header_chunk(WRITE, &toi_fileops,
+			(char *) &toi_writer_posn_save,
+			sizeof(toi_writer_posn_save));
+
+	if (!err)
+		err = toi_bio_ops.rw_header_chunk(WRITE, &toi_fileops,
+				(char *) &devinfo, sizeof(devinfo));
+
+	if (err)
+		return err;
+
+	/* Flush the chain */
+	toi_serialise_extent_chain(&toi_fileops, &block_chain);
+
+	return 0;
+}
+
+/*
+ * toi_file_write_header_cleanup - finish the header and mark the image valid
+ *
+ * Flushes any unsaved header data, then reads the signature page at
+ * target_firstblock, fills in the signature (pointing at the first header
+ * block) and writes it back. Outstanding I/O is always waited for before
+ * returning.
+ *
+ * Returns 0 on success, -ENOMEM if the signature page could not be
+ * allocated, or the first I/O error encountered.
+ */
+static int toi_file_write_header_cleanup(void)
+{
+	struct toi_file_header *header;
+	int result, result2;
+	unsigned long sig_page = toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
+
+	/* Write any unsaved data */
+	result = toi_bio_ops.write_header_chunk_finish();
+
+	if (result)
+		goto out;
+
+	/* Without a page we cannot read or update the signature. */
+	if (!sig_page) {
+		printk(KERN_INFO "Unable to allocate a page for writing the "
+				"image signature.\n");
+		result = -ENOMEM;
+		goto out;
+	}
+
+	toi_extent_state_goto_start(&toi_writer_posn);
+	toi_bio_ops.forward_one_page(1, 1);
+
+	/* Adjust image header */
+	result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
+			target_firstblock,
+			virt_to_page(sig_page));
+	if (result)
+		goto out;
+
+	header = (struct toi_file_header *) sig_page;
+
+	prepare_signature(header,
+			toi_writer_posn.current_offset <<
+			devinfo.bmap_shift);
+
+	result = toi_bio_ops.bdev_page_io(WRITE, toi_file_target_bdev,
+			target_firstblock,
+			virt_to_page(sig_page));
+
+out:
+	result2 = toi_bio_ops.finish_all_io();
+	if (sig_page)
+		toi_free_page(38, sig_page);
+
+	return result ? result : result2;
+}
+
+/* HEADER READING */
+
+/**
+ * toi_file_read_header_init - start reading the image header
+ *
+ * Entry point of the resume path.
+ * 1. Read the first header page from target_header_start.
+ * 2. Restore the saved writer positions and the devinfo structure from
+ *    it, keeping the block device pointer we already have open.
+ * 3. Load the extent chain so the rest of the header and the image can be
+ *    read.
+ *
+ * Returns:
+ *	The I/O error if the first page cannot be read, otherwise the
+ *	result of toi_load_extent_chain().
+ **/
+static int toi_file_read_header_init(void)
+{
+	struct block_device *saved_bdev;
+	int err;
+
+	/* Allocate toi_writer_buffer */
+	toi_bio_ops.read_header_init();
+
+	/*
+	 * Read toi_file configuration (header containing metadata).
+	 * target_header_start is the first sector of the header. It has been
+	 * set when checking if the file was suitable for resuming, see
+	 * do_toi_step(STEP_RESUME_CAN_RESUME).
+	 */
+	err = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
+			target_header_start,
+			virt_to_page((unsigned long) toi_writer_buffer));
+	if (err) {
+		printk(KERN_ERR "FileAllocator read header init: Failed to "
+				"initialise reading the first page of data.\n");
+		toi_bio_ops.rw_cleanup(READ);
+		return err;
+	}
+
+	/* toi_writer_posn_save[0] contains the header */
+	memcpy(&toi_writer_posn_save, toi_writer_buffer,
+			sizeof(toi_writer_posn_save));
+
+	/* Save the position in the buffer */
+	toi_writer_buffer_posn = sizeof(toi_writer_posn_save);
+
+	/*
+	 * The devinfo stored in the image (see tuxonice_block_io.h) carries
+	 * a stale bdev pointer, so preserve ours across the copy.
+	 */
+	saved_bdev = devinfo.bdev;
+	memcpy(&devinfo, toi_writer_buffer + toi_writer_buffer_posn,
+			sizeof(devinfo));
+	devinfo.bdev = saved_bdev;
+
+	toi_writer_buffer_posn += sizeof(devinfo);
+
+	/* Reinitialize the extent pointer */
+	toi_extent_state_goto_start(&toi_writer_posn);
+	/* Jump to the next page */
+	toi_bio_ops.set_extra_page_forward();
+
+	/* Bring back the chain from disk: this will read all extents. */
+	return toi_load_extent_chain(&block_chain);
+}
+
+/*
+ * toi_file_read_header_cleanup - finish reading the header
+ *
+ * Asks the block I/O layer to clean up its read state. Always returns 0.
+ */
+static int toi_file_read_header_cleanup(void)
+{
+ toi_bio_ops.rw_cleanup(READ);
+ return 0;
+}
+
+/**
+ * toi_file_signature_op - perform an operation on the file signature
+ * @op: operation to perform
+ *
+ * op is either GET_IMAGE_EXISTS, INVALIDATE, MARK_RESUME_ATTEMPTED or
+ * UNMARK_RESUME_ATTEMPTED.
+ * If the signature is changed, the page is written back to disk.
+ * The signature exists iff toi_file_signature_op(GET_IMAGE_EXISTS)>-1.
+ *
+ * Returns -1 if there is no usable block device or no signature, -ENOMEM
+ * if no page could be allocated, an I/O error code if reading or writing
+ * failed, and otherwise the parse_signature() result (INVALIDATE reports 1
+ * once the signature has been rewritten).
+ **/
+static int toi_file_signature_op(int op)
+{
+	struct toi_file_header *header;
+	int result, io_result, dirty = 0;
+	char *page;
+
+	if (!toi_file_target_bdev || IS_ERR(toi_file_target_bdev))
+		return -1;
+
+	page = (char *) toi_get_zeroed_page(17, TOI_ATOMIC_GFP);
+	if (!page) {
+		printk(KERN_INFO "Unable to allocate a page for reading the "
+				"image signature.\n");
+		return -ENOMEM;
+	}
+
+	result = toi_bio_ops.bdev_page_io(READ, toi_file_target_bdev,
+			target_firstblock,
+			virt_to_page(page));
+	if (result)
+		goto out;
+
+	header = (struct toi_file_header *) page;
+	result = parse_signature(header);
+
+	if (op == INVALIDATE) {
+		/* Never write a signature where none was found. */
+		if (result == -1)
+			goto out;
+
+		memcpy(header->sig, tuxonice_signature,
+				sizeof(tuxonice_signature));
+		header->resumed_before = 0;
+		header->have_image = 0;
+		result = 1;
+		dirty = 1;
+	} else if (op == MARK_RESUME_ATTEMPTED ||
+			op == UNMARK_RESUME_ATTEMPTED) {
+		/* Only meaningful when an image is present. */
+		if (result == 1) {
+			header->resumed_before = (op == MARK_RESUME_ATTEMPTED);
+			dirty = 1;
+		}
+	}
+
+	if (dirty) {
+		io_result = toi_bio_ops.bdev_page_io(WRITE,
+				toi_file_target_bdev, target_firstblock,
+				virt_to_page(page));
+		if (io_result)
+			result = io_result;
+	}
+
+out:
+	io_result = toi_bio_ops.finish_all_io();
+	toi_free_page(17, (unsigned long) page);
+	return result ? result : io_result;
+}
+
+/**
+ * toi_file_print_debug_stats - print debug info
+ * @buffer: Buffer to fill with the text
+ * @size: Size of the buffer
+ *
+ * Returns the number of characters written to @buffer.
+ **/
+static int toi_file_print_debug_stats(char *buffer, int size)
+{
+	int used;
+
+	if (toiActiveAllocator != &toi_fileops)
+		return scnprintf(buffer, size,
+				"- FileAllocator inactive.\n");
+
+	used = scnprintf(buffer, size, "- FileAllocator active.\n");
+
+	used += scnprintf(buffer + used, size - used, " Storage available for "
+			"image: %d pages.\n",
+			toi_file_storage_allocated());
+
+	return used;
+}
+
+/**
+ * toi_file_storage_needed - storage needed
+ *
+ * Returns amount of space in the image header required
+ * for the toi_file's data: the target name, the saved writer positions,
+ * devinfo, two ints and two unsigned longs per extent in the chain.
+ *
+ * We ensure the space is allocated, but actually save most of the
+ * data from write_header_init.
+ **/
+static int toi_file_storage_needed(void)
+{
+	size_t bytes = strlen(toi_file_target) + 1;
+
+	bytes += sizeof(toi_writer_posn_save);
+	bytes += sizeof(devinfo);
+	bytes += 2 * sizeof(int);
+	bytes += 2 * sizeof(unsigned long) * block_chain.num_extents;
+
+	return bytes;
+}
+
+/**
+ * toi_file_remove_image - invalidate the image
+ *
+ * Releases the block chain and storage counters, then rewrites the
+ * signature so that it no longer advertises an image.
+ *
+ * Returns the result of toi_file_signature_op(INVALIDATE).
+ **/
+static int toi_file_remove_image(void)
+{
+ toi_file_release_storage();
+ return toi_file_signature_op(INVALIDATE);
+}
+
+/**
+ * toi_file_image_exists - test if an image exists
+ * @quiet: Not used by this implementation.
+ *
+ * Repopulate toi_file_target_bdev if needed.
+ *
+ * Returns the result of toi_file_signature_op(GET_IMAGE_EXISTS).
+ **/
+static int toi_file_image_exists(int quiet)
+{
+ if (!toi_file_target_bdev)
+ reopen_resume_devt();
+ return toi_file_signature_op(GET_IMAGE_EXISTS);
+}
+
+/**
+ * toi_file_mark_resume_attempted - set or clear the resume-attempted flag
+ * @mark: non-zero to set the flag, zero to clear it
+ *
+ * Record that we tried to resume from this image. Resuming
+ * multiple times from the same image may be dangerous
+ * (possible filesystem corruption).
+ *
+ * Returns the result of the signature operation.
+ **/
+static int toi_file_mark_resume_attempted(int mark)
+{
+	int op = UNMARK_RESUME_ATTEMPTED;
+
+	if (mark)
+		op = MARK_RESUME_ATTEMPTED;
+
+	return toi_file_signature_op(op);
+}
+
+/**
+ * toi_file_set_resume_param - build the resume= string for the target
+ *
+ * Given the current target, populate the resume parameter. This is
+ * meant to be used by the user to populate the kernel command line.
+ * By setting /sys/power/tuxonice/file/target, the valid resume
+ * parameter to use is set and accessible through
+ * /sys/power/tuxonice/resume.
+ *
+ * The string written to resume_file is "file:/dev/<bdev>[:0x<sector>]",
+ * where the sector is that of the first block of the file, or a "not a
+ * valid target" message when there is no block device. The result is then
+ * passed through toi_attempt_to_parse_resume_device().
+ **/
+static void toi_file_set_resume_param(void)
+{
+ char *buffer = (char *) toi_get_zeroed_page(18, TOI_ATOMIC_GFP);
+ char *buffer2 = (char *) toi_get_zeroed_page(19, TOI_ATOMIC_GFP);
+ unsigned long sector = bmap(target_inode, 0);
+ int offset = 0;
+
+ /* Both scratch pages are needed; free whichever one we did get. */
+ if (!buffer || !buffer2) {
+ if (buffer)
+ toi_free_page(18, (unsigned long) buffer);
+ if (buffer2)
+ toi_free_page(19, (unsigned long) buffer2);
+ printk(KERN_ERR "TuxOnIce: Failed to allocate memory while "
+ "setting resume= parameter.\n");
+ return;
+ }
+
+ if (toi_file_target_bdev) {
+ set_devinfo(toi_file_target_bdev, target_inode->i_blkbits);
+
+ bdevname(toi_file_target_bdev, buffer2);
+ offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+ "/dev/%s", buffer2);
+
+ if (sector)
+ /* The offset is: sector << (inode->i_blkbits - 9) */
+ offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+ ":0x%lx", sector << devinfo.bmap_shift);
+ } else
+ offset += snprintf(buffer + offset, PAGE_SIZE - offset,
+ "%s is not a valid target.", toi_file_target);
+
+ /*
+ * NOTE(review): unbounded sprintf; buffer can hold up to a page of
+ * text - confirm resume_file is large enough for the longest case.
+ */
+ sprintf(resume_file, "file:%s", buffer);
+
+ toi_free_page(18, (unsigned long) buffer);
+ toi_free_page(19, (unsigned long) buffer2);
+
+ toi_attempt_to_parse_resume_device(1);
+}
+
+/**
+ * __test_toi_file_target - is the file target valid for hibernating?
+ * @target: target file
+ * @resume_param: non-zero when called on behalf of the resume= parameter
+ * @quiet: suppress the informational messages
+ *
+ * Test whether the file target can be used for hibernating: valid target
+ * and signature. When not called for resume=, the resume parameter is
+ * regenerated and TOI_CAN_HIBERNATE is set on success.
+ *
+ * Returns 0 if the target is usable, 1 otherwise (TOI_CAN_HIBERNATE is
+ * then cleared).
+ **/
+static int __test_toi_file_target(char *target, int resume_param, int quiet)
+{
+	toi_file_get_target_info(target, 0, resume_param);
+
+	if (toi_file_signature_op(GET_IMAGE_EXISTS) < 0) {
+		/*
+		 * Target unaccessible or no signature found
+		 * Most errors have already been reported
+		 */
+		clear_toi_state(TOI_CAN_HIBERNATE);
+
+		if (quiet)
+			return 1;
+
+		if (*target)
+			printk(KERN_INFO "TuxOnIce: FileAllocator: Sorry. No signature "
+					"found at %s.\n", target);
+		else if (!resume_param)
+			printk(KERN_INFO "TuxOnIce: FileAllocator: Sorry. "
+					"Target is not set for hibernating.\n");
+
+		return 1;
+	}
+
+	if (!quiet)
+		printk(KERN_INFO "TuxOnIce: FileAllocator: File "
+				"signature found.\n");
+
+	if (!resume_param)
+		toi_file_set_resume_param();
+
+	toi_bio_ops.set_devinfo(&devinfo);
+	toi_writer_posn.chains = &block_chain;
+	toi_writer_posn.num_chains = 1;
+
+	if (!resume_param)
+		set_toi_state(TOI_CAN_HIBERNATE);
+
+	return 0;
+}
+
+/**
+ * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
+ *
+ * Test whether the target file is valid for hibernating and log the
+ * outcome. setting_toi_file_target is raised for the duration so that
+ * toi_file_parse_sig_location() does not test the target a second time.
+ **/
+static void test_toi_file_target(void)
+{
+	int unusable;
+
+	setting_toi_file_target = 1;
+
+	unusable = __test_toi_file_target(toi_file_target, 0, 1);
+	printk(KERN_INFO "TuxOnIce: Hibernating %sabled.\n",
+			unusable ? "dis" : "en");
+
+	setting_toi_file_target = 0;
+}
+
+/**
+ * toi_file_parse_sig_location - parse image Location
+ * @commandline: the resume parameter
+ * @only_writer: if non-zero, a string without the "file:" prefix is still
+ *	parsed; if zero such a string is rejected with 1
+ * @quiet: quiet flag
+ *
+ * Attempt to parse a resume= parameter.
+ * File Allocator accepts:
+ * resume=file:DEVNAME[:FIRSTBLOCK][@BLOCKSIZE]
+ *
+ * Where:
+ * DEVNAME is convertable to a dev_t by name_to_dev_t
+ * FIRSTBLOCK is the location of the first block in the file.
+ * BLOCKSIZE is the logical blocksize >= SECTOR_SIZE &
+ * <= PAGE_SIZE,
+ * mod SECTOR_SIZE == 0 of the device.
+ *
+ * Data is validated by attempting to read a header from the
+ * location given. Failure will result in toi_file refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
+ *
+ * @commandline is modified while parsing but restored before returning.
+ *
+ * Returns 0 if resuming is possible (or the device was already set up),
+ * 1 if the string is not for this allocator, and another non-zero value
+ * on failure.
+ **/
+static int toi_file_parse_sig_location(char *commandline,
+		int only_writer, int quiet)
+{
+	char *thischar, *colon = NULL, *at_symbol = NULL;
+	int result = -EINVAL, target_blocksize = 0;
+
+	if (strncmp(commandline, "file:", 5)) {
+		if (!only_writer)
+			return 1;
+	} else
+		commandline += 5;
+
+	/*
+	 * Don't check signature again if we're beginning a cycle. If we already
+	 * did the initialisation successfully, assume we'll be okay when it
+	 * comes to resuming.
+	 */
+	if (toi_file_target_bdev)
+		return 0;
+
+	/* Split DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] in place. */
+	thischar = commandline;
+	while ((*thischar != ':') && (*thischar != '@') &&
+		((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == ':') {
+		colon = thischar;
+		*colon = 0;
+		thischar++;
+	}
+
+	while ((*thischar != '@') && ((thischar - commandline) < 250)
+			&& (*thischar))
+		thischar++;
+
+	if (*thischar == '@') {
+		at_symbol = thischar;
+		*at_symbol = 0;
+	}
+
+	/*
+	 * For the toi_file, you can be able to resume, but not hibernate,
+	 * because the resume= is set correctly, but the toi_file_target
+	 * isn't.
+	 *
+	 * We may have come here as a result of setting resume or
+	 * toi_file_target. We only test the toi_file target in the
+	 * former case (it's already done in the later), and we do it before
+	 * setting the block number ourselves. It will overwrite the values
+	 * given on the command line if we don't.
+	 */
+
+	if (!setting_toi_file_target) /* Concurrent write via /sys? */
+		__test_toi_file_target(toi_file_target, 1, 0);
+
+	if (colon) {
+		unsigned long block;
+		result = strict_strtoul(colon + 1, 0, &block);
+		if (result)
+			goto out;
+		/* No (int) cast: sectors beyond 2^31 must not be truncated. */
+		target_firstblock = block;
+	} else
+		target_firstblock = 0;
+
+	if (at_symbol) {
+		unsigned long block_size;
+		result = strict_strtoul(at_symbol + 1, 0, &block_size);
+		if (result)
+			goto out;
+		/* A larger block would give set_devinfo() a negative shift. */
+		if (block_size > PAGE_SIZE) {
+			printk(KERN_INFO "FileAllocator: Blocksizes are "
+					"at most %lu bytes.\n",
+					(unsigned long) PAGE_SIZE);
+			result = -EINVAL;
+			goto out;
+		}
+		target_blocksize = (int) block_size;
+		if (target_blocksize & (SECTOR_SIZE - 1)) {
+			printk(KERN_INFO "FileAllocator: Blocksizes are "
+					"multiples of %d.\n", SECTOR_SIZE);
+			result = -EINVAL;
+			goto out;
+		}
+	}
+
+	if (!quiet)
+		printk(KERN_INFO "TuxOnIce FileAllocator: Testing whether you "
+				"can resume:\n");
+
+	toi_file_get_target_info(commandline, 0, 1);
+
+	if (!toi_file_target_bdev || IS_ERR(toi_file_target_bdev)) {
+		toi_file_target_bdev = NULL;
+		result = -1;
+		goto out;
+	}
+
+	/*
+	 * set_devinfo() wants log2 of the block size; ffs() is one-based
+	 * (ffs(512) == 10), hence the - 1.
+	 */
+	if (target_blocksize)
+		set_devinfo(toi_file_target_bdev, ffs(target_blocksize) - 1);
+
+	result = __test_toi_file_target(commandline, 1, quiet);
+
+out:
+	if (result)
+		clear_toi_state(TOI_CAN_HIBERNATE);
+
+	if (!quiet)
+		printk(KERN_INFO "Resuming %sabled.\n", result ? "dis" : "en");
+
+	/* Undo the in-place splitting of the caller's string. */
+	if (colon)
+		*colon = ':';
+	if (at_symbol)
+		*at_symbol = '@';
+
+	return result;
+}
+
+/**
+ * toi_file_save_config_info - save toi_file_target into the config buffer
+ * @buffer: Pointer to a buffer of size PAGE_SIZE.
+ *
+ * Save the target's name, not for resume time, but for
+ * all_settings.
+ * Returns:
+ *	Number of bytes used for saving our data, including the
+ *	terminating NUL.
+ **/
+static int toi_file_save_config_info(char *buffer)
+{
+	size_t bytes = strlen(toi_file_target) + 1;
+
+	memcpy(buffer, toi_file_target, bytes);
+
+	return bytes;
+}
+
+/**
+ * toi_file_load_config_info - reload target's name
+ * @buffer: pointer to the start of the data
+ * @size: number of bytes that were saved (including the terminating NUL)
+ *
+ * toi_file_target is set to buffer. @size comes from saved data, so it is
+ * clamped to the size of toi_file_target before copying; a non-positive
+ * size leaves toi_file_target untouched.
+ **/
+static void toi_file_load_config_info(char *buffer, int size)
+{
+	size_t limit = sizeof(toi_file_target);
+
+	if (size <= 0)
+		return;
+
+	if ((size_t) size < limit)
+		limit = size;
+
+	strlcpy(toi_file_target, buffer, limit);
+}
+
+/*
+ * toi_file_initialise - prepare the file allocator
+ * @starting_cycle: Non-zero when a hibernation cycle is being started.
+ *
+ * When starting a cycle this does nothing unless the file allocator is the
+ * active one, and it insists on a target name if the cycle was requested
+ * with SYSFS_HIBERNATE. The target information is then (re)read and, when
+ * starting a cycle, the signature is checked.
+ *
+ * Returns 0 on success and 1 if the target is missing or has no valid
+ * signature.
+ */
+static int toi_file_initialise(int starting_cycle)
+{
+	if (starting_cycle) {
+		if (toiActiveAllocator != &toi_fileops)
+			return 0;
+
+		if ((starting_cycle & SYSFS_HIBERNATE) && !*toi_file_target) {
+			printk(KERN_INFO "FileAllocator is the active writer, "
+					"but no filename has been set.\n");
+			return 1;
+		}
+	}
+
+	if (*toi_file_target)
+		toi_file_get_target_info(toi_file_target, starting_cycle, 0);
+
+	/*
+	 * Any negative result (missing signature, I/O error, allocation
+	 * failure) means we cannot rely on the target.
+	 */
+	if (starting_cycle && (toi_file_image_exists(1) < 0)) {
+		printk(KERN_INFO "%s does not have a valid signature for "
+				"hibernating.\n", toi_file_target);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * sysfs entries exported by the file allocator: "target" holds the file
+ * name (writes run test_toi_file_target) and "enabled" toggles
+ * toi_fileops.enabled (writes run attempt_to_parse_resume_device2).
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+
+ SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256,
+ SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0,
+ attempt_to_parse_resume_device2)
+};
+
+/*
+ * Module operations for the file allocator. The block I/O callbacks that
+ * are not listed here are filled in from toi_bio_ops by toi_file_load().
+ */
+static struct toi_module_ops toi_fileops = {
+ .type = WRITER_MODULE,
+ .name = "file storage",
+ .directory = "file",
+ .module = THIS_MODULE,
+ .print_debug_info = toi_file_print_debug_stats,
+ .save_config_info = toi_file_save_config_info,
+ .load_config_info = toi_file_load_config_info,
+ .storage_needed = toi_file_storage_needed,
+ .initialise = toi_file_initialise,
+ .cleanup = toi_file_cleanup,
+
+ /* Writer-specific callbacks */
+ .noresume_reset = toi_file_noresume_reset,
+ .storage_available = toi_file_storage_available,
+ .storage_allocated = toi_file_storage_allocated,
+ .reserve_header_space = toi_file_reserve_header_space,
+ .allocate_storage = toi_file_allocate_storage,
+ .image_exists = toi_file_image_exists,
+ .mark_resume_attempted = toi_file_mark_resume_attempted,
+ .write_header_init = toi_file_write_header_init,
+ .write_header_cleanup = toi_file_write_header_cleanup,
+ .read_header_init = toi_file_read_header_init,
+ .read_header_cleanup = toi_file_read_header_cleanup,
+ .remove_image = toi_file_remove_image,
+ .parse_sig_location = toi_file_parse_sig_location,
+
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+
+/*
+ * toi_file_load - register the file allocator
+ *
+ * Copies the shared block I/O callbacks from toi_bio_ops into toi_fileops
+ * and then registers the module. Returns the result of
+ * toi_register_module().
+ */
+static __init int toi_file_load(void)
+{
+ toi_fileops.rw_init = toi_bio_ops.rw_init;
+ toi_fileops.rw_cleanup = toi_bio_ops.rw_cleanup;
+ toi_fileops.read_page = toi_bio_ops.read_page;
+ toi_fileops.write_page = toi_bio_ops.write_page;
+ toi_fileops.rw_header_chunk = toi_bio_ops.rw_header_chunk;
+ toi_fileops.rw_header_chunk_noreadahead =
+ toi_bio_ops.rw_header_chunk_noreadahead;
+ toi_fileops.io_flusher = toi_bio_ops.io_flusher;
+ toi_fileops.update_throughput_throttle =
+ toi_bio_ops.update_throughput_throttle;
+ toi_fileops.finish_all_io = toi_bio_ops.finish_all_io;
+
+ return toi_register_module(&toi_fileops);
+}
+
+late_initcall(toi_file_load);
Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
---
kernel/power/Kconfig | 8 +
kernel/power/Makefile | 1 +
kernel/power/tuxonice_swap.c | 1319 ++++++++++++++++++++++++++++++++++++++++++
3 files changed, 1328 insertions(+), 0 deletions(-)
create mode 100644 kernel/power/tuxonice_swap.c
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 51d57d4..d8e62b6 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -201,6 +201,14 @@ menuconfig TOI_CORE
simple file. This should be possible, but we're still
testing it.
+ config TOI_SWAP
+ bool "Swap Allocator"
+ depends on TOI_CORE && SWAP
+ default y
+ ---help---
+ This option enables support for storing an image in your
+ swap space.
+
comment "General Options"
depends on TOI_CORE
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 41daada..9f6d06f 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -17,6 +17,7 @@ endif
obj-$(CONFIG_TOI_CORE) += tuxonice_core.o
obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o
+obj-$(CONFIG_TOI_SWAP) += tuxonice_block_io.o tuxonice_swap.o
obj-$(CONFIG_TOI_FILE) += tuxonice_block_io.o tuxonice_file.o
obj-$(CONFIG_PM) += main.o
diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c
new file mode 100644
index 0000000..420edec
--- /dev/null
+++ b/kernel/power/tuxonice_swap.c
@@ -0,0 +1,1319 @@
+/*
+ * kernel/power/tuxonice_swap.c
+ *
+ * Copyright (C) 2004-2008 Nigel Cunningham (nigel at tuxonice net)
+ *
+ * Distributed under GPLv2.
+ *
+ * This file encapsulates functions for usage of swap space as a
+ * backing store.
+ */
+
+#include <linux/suspend.h>
+#include <linux/blkdev.h>
+#include <linux/swapops.h>
+#include <linux/swap.h>
+#include <linux/syscalls.h>
+
+#include "tuxonice.h"
+#include "tuxonice_sysfs.h"
+#include "tuxonice_modules.h"
+#include "tuxonice_io.h"
+#include "tuxonice_ui.h"
+#include "tuxonice_extent.h"
+#include "tuxonice_block_io.h"
+#include "tuxonice_alloc.h"
+#include "tuxonice_builtin.h"
+
+static struct toi_module_ops toi_swapops;
+
+/* --- Struct of pages stored on disk */
+
+/*
+ * Signature record written into the swap header page while an image
+ * exists (see write_modified_signature() and parse_signature()).
+ */
+struct sig_data {
+ /* bd_dev of the swap device holding the first header page. */
+ dev_t device;
+ /* Writer position (current_offset) of that first header page. */
+ unsigned long sector;
+ /* Non-zero once TRIED_RESUME has been written for this image. */
+ int resume_attempted;
+ /* parse_signature() result recorded when the image signature is written. */
+ int orig_sig_type;
+};
+
+/*
+ * Two views of the swap header page: the regular swap header and our
+ * signature record, which share the same storage.
+ */
+union diskpage {
+ union swap_header swh; /* swh.magic is the only member used */
+ struct sig_data sig_data;
+};
+
+/*
+ * Convenience union so one page reference can be used as a typed pointer,
+ * a byte pointer or the raw address handed to the page allocator helpers.
+ */
+union p_diskpage {
+ union diskpage *pointer;
+ char *ptr;
+ unsigned long address;
+};
+
+/* Modifications understood by write_modified_signature(). */
+enum {
+ /* Record the header location and write the TuxOnIce signature. */
+ IMAGE_SIGNATURE,
+ /* Put a swap signature and the saved header start back. */
+ NO_IMAGE_SIGNATURE,
+ /* Set sig_data.resume_attempted. */
+ TRIED_RESUME,
+ /* Clear sig_data.resume_attempted. */
+ NO_TRIED_RESUME,
+};
+
+/*
+ * Both of these hold versions of the swap header page.
+ * current_signature_page points to the data we read from disk at the start
+ * of hibernating or when checking whether to resume.
+ * no_image_signature_contents is the copy stored in the image header,
+ * showing what the start of the swap header page (the first
+ * sizeof(struct sig_data) bytes) looked like at the start of hibernating.
+ */
+static char *current_signature_page;
+static char no_image_signature_contents[sizeof(struct sig_data)];
+
+/* Devices used for swap */
+static struct toi_bdev_info devinfo[MAX_SWAPFILES];
+
+/* Extent chains for swap & blocks */
+static struct hibernate_extent_chain swapextents;
+static struct hibernate_extent_chain block_chain[MAX_SWAPFILES];
+
+static dev_t header_dev_t;
+static struct block_device *header_block_device;
+static unsigned long headerblock;
+
+/* For swapfile automatically swapon/off'd. */
+static char swapfilename[32] = "";
+static int toi_swapon_status;
+
+/* Header Page Information */
+static long header_pages_reserved;
+
+/* Swap Pages */
+static long swap_pages_allocated;
+
+/* User Specified Parameters. */
+
+static unsigned long resume_firstblock;
+static dev_t resume_swap_dev_t;
+static struct block_device *resume_block_device;
+
+static struct sysinfo swapinfo;
+
+/*
+ * Bookkeeping for one block device opened by open_bdev(): the device
+ * number it was opened for and the resulting block_device, which
+ * close_bdev() later releases.
+ */
+struct bdev_opened {
+ dev_t device;
+ struct block_device *bdev;
+};
+
+/*
+ * Entry MAX_SWAPFILES is the resume block device, which may
+ * be a swap device not enabled when we hibernate.
+ * Entry MAX_SWAPFILES + 1 is the header block device, which
+ * is needed before we find out which slot it occupies.
+ *
+ * We use a separate struct to devinfo so that we can track
+ * the bdevs we open, because if we need to abort resuming
+ * prior to the atomic restore, they need to be closed, but
+ * closing them after successfully resuming would be wrong.
+ */
+static struct bdev_opened *bdevs_opened[MAX_SWAPFILES + 2];
+
+/**
+ * close_bdev: Release one entry of bdevs_opened[].
+ *
+ * i: Index of the entry to close.
+ *
+ * Does nothing if the entry is empty. Otherwise the block device is put,
+ * the bookkeeping struct is freed and the entry is cleared.
+ */
+static void close_bdev(int i)
+{
+	struct bdev_opened *entry = bdevs_opened[i];
+
+	if (entry) {
+		blkdev_put(entry->bdev, FMODE_READ | FMODE_NDELAY);
+		toi_kfree(8, entry, sizeof(*entry));
+		bdevs_opened[i] = NULL;
+	}
+}
+
+/**
+ * close_bdevs: Release every entry of bdevs_opened[].
+ *
+ * All MAX_SWAPFILES + 2 slots are closed in index order, then the cached
+ * resume and header block device pointers are cleared.
+ */
+static void close_bdevs(void)
+{
+	int slot = 0;
+
+	while (slot < MAX_SWAPFILES + 2)
+		close_bdev(slot++);
+
+	resume_block_device = NULL;
+	header_block_device = NULL;
+}
+
+/**
+ * open_bdev: Open a bdev at resume time.
+ *
+ * index: Slot of bdevs_opened[] to use. Slots below MAX_SWAPFILES are swap
+ *	devices. MAX_SWAPFILES is the resume= device (the user can have
+ *	resume= pointing at a swap partition/file that isn't swapon'd when
+ *	they hibernate). MAX_SWAPFILES + 1 is the device holding the first
+ *	page of the header: it will be a swap device that was enabled when
+ *	we hibernated, but we don't know its real index until we read that
+ *	first page.
+ * device: The device major/minor.
+ * display_errs: Non-zero to report a failure to open the device via
+ *	toi_early_boot_message().
+ *
+ * We stored a dev_t in the image header. Open the matching device without
+ * requiring /dev/<whatever> in most cases and record the details needed
+ * to close it later and avoid duplicating work: if the slot already holds
+ * this device the cached block_device is returned, and if it holds another
+ * device that one is closed first.
+ *
+ * Returns the block_device on success, ERR_PTR(-EINVAL) if the device
+ * could not be opened, or ERR_PTR(-ENOMEM) if the bookkeeping entry could
+ * not be allocated.
+ */
+static struct block_device *open_bdev(int index, dev_t device, int display_errs)
+{
+	struct bdev_opened *this;
+	struct block_device *bdev;
+
+	if (bdevs_opened[index]) {
+		if (bdevs_opened[index]->device == device)
+			return bdevs_opened[index]->bdev;
+
+		close_bdev(index);
+	}
+
+	bdev = toi_open_by_devnum(device, FMODE_READ | FMODE_NDELAY);
+
+	if (IS_ERR(bdev) || !bdev) {
+		/*
+		 * The message takes an int error code, so decode the error
+		 * pointer rather than passing the pointer itself. A NULL
+		 * result carries no code; report it as -ENODEV.
+		 */
+		int err = bdev ? (int) PTR_ERR(bdev) : -ENODEV;
+
+		if (display_errs)
+			toi_early_boot_message(1, TOI_CONTINUE_REQ,
+				"Failed to get access to block device "
+				"\"%x\" (error %d).\n Maybe you need "
+				"to run mknod and/or lvmsetup in an "
+				"initrd/ramfs?", device, err);
+		return ERR_PTR(-EINVAL);
+	}
+
+	this = toi_kzalloc(8, sizeof(struct bdev_opened), GFP_KERNEL);
+	if (!this) {
+		printk(KERN_WARNING "TuxOnIce: Failed to allocate memory for "
+				"opening a bdev.\n");
+		/* Don't leave the device open with nothing tracking it. */
+		blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	bdevs_opened[index] = this;
+	this->device = device;
+	this->bdev = bdev;
+
+	return bdev;
+}
+
+/**
+ * enable_swapfile: Swapon the user specified swapfile prior to hibernating.
+ *
+ * Does nothing when no swapfile name has been configured. Otherwise the
+ * swapfile is activated with priority 0xFFFF. toi_swapon_status is set only
+ * when our swapon call succeeded, so that disable_swapfile() swapoffs only
+ * what we enabled ourselves. -EBUSY is not reported as an error; any other
+ * failure is logged and hibernation is allowed to continue.
+ */
+static void enable_swapfile(void)
+{
+	int activateswapresult;
+
+	if (!swapfilename[0])
+		return;
+
+	/* Attempt to swap on with maximum priority */
+	activateswapresult = sys_swapon(swapfilename, 0xFFFF);
+	if (activateswapresult && activateswapresult != -EBUSY)
+		/* Name the sysfs entry as registered: "swapfilename". */
+		printk("TuxOnIce: The swapfile/partition specified by "
+			"/sys/power/tuxonice/swap/swapfilename "
+			"(%s) could not be turned on (error %d). "
+			"Attempting to continue.\n",
+			swapfilename, activateswapresult);
+	if (!activateswapresult)
+		toi_swapon_status = 1;
+}
+
+/**
+ * disable_swapfile: Undo the swapon done by enable_swapfile().
+ *
+ * Only acts when toi_swapon_status says that we activated the swapfile
+ * ourselves; in that case it is swapoff'd and the flag is cleared.
+ */
+static void disable_swapfile(void)
+{
+	if (toi_swapon_status) {
+		sys_swapoff(swapfilename);
+		toi_swapon_status = 0;
+	}
+}
+
+/**
+ * try_to_parse_resume_device: Try to parse resume=
+ *
+ * commandline: Device name or path; any "swap:" has already been stripped.
+ * quiet: Non-zero to suppress messages.
+ *
+ * We attempt to do name_to_dev_t. If that yields nothing, we open and stat
+ * the path (or just stat it if the open fails) and use the rdev found.
+ * Having got a dev_t, get the struct block_device * to match.
+ *
+ * Returns 0 on success and 1 on failure. On failure resume_block_device is
+ * left NULL, never an error pointer, so that later "is it open yet?" checks
+ * stay meaningful.
+ */
+static int try_to_parse_resume_device(char *commandline, int quiet)
+{
+	struct kstat stat;
+	int error = 0;
+
+	wait_for_device_probe();
+	resume_swap_dev_t = name_to_dev_t(commandline);
+
+	if (!resume_swap_dev_t) {
+		struct file *file = filp_open(commandline,
+				O_RDONLY|O_LARGEFILE, 0);
+
+		if (!IS_ERR(file) && file) {
+			/* stat is only valid if vfs_getattr succeeded. */
+			error = vfs_getattr(file->f_vfsmnt, file->f_dentry,
+					&stat);
+			filp_close(file, NULL);
+		} else
+			error = vfs_stat(commandline, &stat);
+		if (!error)
+			resume_swap_dev_t = stat.rdev;
+	}
+
+	if (!resume_swap_dev_t) {
+		if (quiet)
+			return 1;
+
+		if (test_toi_state(TOI_TRYING_TO_RESUME))
+			toi_early_boot_message(1, TOI_CONTINUE_REQ,
+			  "Failed to translate \"%s\" into a device id.\n",
+			  commandline);
+		else
+			printk("TuxOnIce: Can't translate \"%s\" into a device "
+					"id yet.\n", commandline);
+		return 1;
+	}
+
+	resume_block_device = open_bdev(MAX_SWAPFILES, resume_swap_dev_t, 0);
+	if (IS_ERR(resume_block_device)) {
+		/* Don't keep an error pointer around as if it were a bdev. */
+		resume_block_device = NULL;
+		if (!quiet)
+			toi_early_boot_message(1, TOI_CONTINUE_REQ,
+				"Failed to get access to \"%s\", where"
+				" the swap header should be found.",
+				commandline);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * toi_swap_noresume_reset
+ *
+ * Called when we are not going to resume after all. Part of the image may
+ * already have been read, so clean up the read side of the block I/O code
+ * and wipe the device information.
+ */
+static void toi_swap_noresume_reset(void)
+{
+	toi_bio_ops.rw_cleanup(READ);
+	memset(devinfo, 0, sizeof(devinfo));
+}
+
+/*
+ * get_current_signature
+ *
+ * Read the page at resume_firstblock of the resume device into
+ * current_signature_page, allocating that page on first use.
+ *
+ * Returns -ENOMEM if the page cannot be allocated, otherwise the result of
+ * the page read.
+ */
+static int get_current_signature(void)
+{
+	if (!current_signature_page)
+		current_signature_page = (char *) toi_get_zeroed_page(38,
+				TOI_ATOMIC_GFP);
+
+	if (!current_signature_page)
+		return -ENOMEM;
+
+	return toi_bio_ops.bdev_page_io(READ, resume_block_device,
+			resume_firstblock,
+			virt_to_page(current_signature_page));
+}
+
+/*
+ * parse_signature
+ *
+ * Re-read the signature page and classify it.
+ *
+ * Returns:
+ *   the (non-zero) result of get_current_signature() if the read failed,
+ *   0..4  index of the first entry of the known signature table whose text
+ *         is a prefix of the magic field,
+ *   10    the TuxOnIce signature was found; header_dev_t, headerblock and
+ *         the TOI_RESUMED_BEFORE state are updated from the signature data,
+ *   -1    nothing recognised.
+ */
+static int parse_signature(void)
+{
+	static const int num_known = 5;
+	const char *known[] = {
+		"SWAP-SPACE", "SWAPSPACE2", "S1SUSP", "S2SUSP", "S1SUSPEND"
+	};
+	struct sig_data *sig;
+	char *magic;
+	int idx, err;
+
+	err = get_current_signature();
+	if (err)
+		return err;
+
+	sig = (struct sig_data *) current_signature_page;
+	magic = ((union diskpage *) current_signature_page)->swh.magic.magic;
+
+	for (idx = 0; idx < num_known; idx++) {
+		if (memcmp(known[idx], magic, strlen(known[idx])) == 0)
+			return idx;
+	}
+
+	if (memcmp(tuxonice_signature, magic, sizeof(tuxonice_signature)) != 0)
+		return -1;
+
+	header_dev_t = sig->device;
+	clear_toi_state(TOI_RESUMED_BEFORE);
+	if (sig->resume_attempted)
+		set_toi_state(TOI_RESUMED_BEFORE);
+	headerblock = sig->sector;
+
+	return 10;
+}
+
+/*
+ * forget_signatures
+ *
+ * Free the cached signature page, if there is one, and forget about it.
+ */
+static void forget_signatures(void)
+{
+	if (!current_signature_page)
+		return;
+
+	toi_free_page(38, (unsigned long) current_signature_page);
+	current_signature_page = NULL;
+}
+
+/*
+ * write_modified_signature
+ *
+ * Write a (potentially) modified signature page without forgetting the
+ * original contents.
+ *
+ * modification: One of IMAGE_SIGNATURE, NO_IMAGE_SIGNATURE, TRIED_RESUME or
+ * NO_TRIED_RESUME. Any other value writes the page back unmodified.
+ *
+ * The page at resume_firstblock is re-read, copied into a temporary page,
+ * modified there and written back to the resume device. Whatever the
+ * outcome of the write, the modified contents then replace
+ * current_signature_page.
+ *
+ * Returns the read error, -ENOMEM if no temporary page is available, or
+ * the result of the write.
+ */
+static int write_modified_signature(int modification)
+{
+ union p_diskpage swap_header_page;
+ struct swap_info_struct *si;
+ int result;
+ char *orig_sig;
+
+ /* In case we haven't already */
+ result = get_current_signature();
+
+ if (result)
+ return result;
+
+ swap_header_page.address = toi_get_zeroed_page(38, TOI_ATOMIC_GFP);
+
+ if (!swap_header_page.address)
+ return -ENOMEM;
+
+ /* Work on a copy; current_signature_page is only updated at the end. */
+ memcpy(swap_header_page.ptr, current_signature_page, PAGE_SIZE);
+
+ switch (modification) {
+ case IMAGE_SIGNATURE:
+
+ /*
+ * Remember the bytes our sig_data is about to overwrite so that
+ * NO_IMAGE_SIGNATURE can put them back.
+ */
+ memcpy(no_image_signature_contents, swap_header_page.ptr,
+ sizeof(no_image_signature_contents));
+
+ /* Get the details of the header first page. */
+ toi_extent_state_goto_start(&toi_writer_posn);
+ toi_bio_ops.forward_one_page(1, 1);
+
+ si = get_swap_info_struct(toi_writer_posn.current_chain);
+
+ /* Prepare the signature */
+ swap_header_page.pointer->sig_data.device = si->bdev->bd_dev;
+ swap_header_page.pointer->sig_data.sector =
+ toi_writer_posn.current_offset;
+ swap_header_page.pointer->sig_data.resume_attempted = 0;
+ /*
+ * parse_signature() re-reads the page from disk, so this records
+ * the type of the signature being replaced (or a negative value
+ * if it could not be read or recognised).
+ */
+ swap_header_page.pointer->sig_data.orig_sig_type =
+ parse_signature();
+
+ memcpy(swap_header_page.pointer->swh.magic.magic,
+ tuxonice_signature, sizeof(tuxonice_signature));
+
+ break;
+ case NO_IMAGE_SIGNATURE:
+ /*
+ * orig_sig_type must be read before the saved bytes are copied
+ * back over sig_data below. Type 0 restores "SWAP-SPACE"; every
+ * other value restores "SWAPSPACE2".
+ */
+ if (!swap_header_page.pointer->sig_data.orig_sig_type)
+ orig_sig = "SWAP-SPACE";
+ else
+ orig_sig = "SWAPSPACE2";
+
+ memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10);
+ memcpy(swap_header_page.ptr, no_image_signature_contents,
+ sizeof(no_image_signature_contents));
+ break;
+ case TRIED_RESUME:
+ swap_header_page.pointer->sig_data.resume_attempted = 1;
+ break;
+ case NO_TRIED_RESUME:
+ swap_header_page.pointer->sig_data.resume_attempted = 0;
+ break;
+ }
+
+ result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device,
+ resume_firstblock, virt_to_page(swap_header_page.address));
+
+ /* NOTE(review): the cached copy is updated even if the write failed. */
+ memcpy(current_signature_page, swap_header_page.ptr, PAGE_SIZE);
+
+ toi_free_page(38, swap_header_page.address);
+
+ return result;
+}
+
+/*
+ * apply_header_reservation
+ *
+ * Starting from the beginning of the storage, step over
+ * header_pages_reserved pages and save the resulting position as the
+ * start of pageset 2.
+ *
+ * Returns 0 on success or -ENOSPC if the storage runs out first.
+ */
+static int apply_header_reservation(void)
+{
+	int skipped = 0;
+
+	toi_extent_state_goto_start(&toi_writer_posn);
+
+	while (skipped < header_pages_reserved) {
+		if (toi_bio_ops.forward_one_page(1, 0))
+			return -ENOSPC;
+		skipped++;
+	}
+
+	/* The end of header pages will be the start of pageset 2;
+	 * we are now sitting on the first pageset2 page. */
+	toi_extent_state_save(&toi_writer_posn, &toi_writer_posn_save[2]);
+	return 0;
+}
+
+/*
+ * toi_swap_reserve_header_space
+ *
+ * Record how many pages are to be set aside for the image header. The
+ * value is only stored here; apply_header_reservation() acts on it.
+ */
+static void toi_swap_reserve_header_space(int request)
+{
+	long pages = request;
+
+	header_pages_reserved = pages;
+}
+
+/*
+ * free_block_chains
+ *
+ * Release every non-empty per-swapfile block extent chain.
+ */
+static void free_block_chains(void)
+{
+	int chain;
+
+	for (chain = 0; chain < MAX_SWAPFILES; chain++) {
+		if (!block_chain[chain].first)
+			continue;
+
+		toi_put_extent_chain(&block_chain[chain]);
+	}
+}
+
+/*
+ * add_blocks_to_extent_chain
+ *
+ * Append the range start-end to the block chain of swapfile 'chain',
+ * logging it first when TOI_TEST_BIO is set.
+ *
+ * Returns 0 on success. If the extent cannot be added, all block chains
+ * are freed and -ENOMEM is returned.
+ */
+static int add_blocks_to_extent_chain(int chain, int start, int end)
+{
+	int failed;
+
+	if (test_action_state(TOI_TEST_BIO))
+		printk(KERN_INFO "Adding extent chain %d %d-%d.\n", chain,
+				start << devinfo[chain].bmap_shift,
+				end << devinfo[chain].bmap_shift);
+
+	failed = toi_add_to_extent_chain(&block_chain[chain], start, end);
+	if (!failed)
+		return 0;
+
+	free_block_chains();
+	return -ENOMEM;
+}
+
+
+/*
+ * get_main_pool_phys_params
+ *
+ * Rebuild the per-swapfile block chains from the swap entries recorded in
+ * swapextents. Each entry is mapped with map_swap_page(); runs of
+ * consecutive results on the same swapfile are merged into one extent.
+ * Entries on devices marked as ignored are skipped.
+ *
+ * Returns -ENOMEM if an extent could not be added, otherwise the result of
+ * apply_header_reservation().
+ */
+static int get_main_pool_phys_params(void)
+{
+ struct hibernate_extent *extentpointer = NULL;
+ unsigned long address;
+ /* The extent being built; -1 means none has been started yet. */
+ int extent_min = -1, extent_max = -1, last_chain = -1;
+
+ free_block_chains();
+
+ toi_extent_for_each(&swapextents, extentpointer, address) {
+ swp_entry_t swap_address = (swp_entry_t) { address };
+ pgoff_t offset = swp_offset(swap_address);
+ unsigned swapfilenum = swp_type(swap_address);
+ struct swap_info_struct *sis =
+ get_swap_info_struct(swapfilenum);
+ /*
+ * NOTE(review): new_sector is a sector_t but the extent bounds
+ * below are ints - presumably this truncates on large devices;
+ * confirm.
+ */
+ sector_t new_sector = map_swap_page(sis, offset);
+
+ if (devinfo[swapfilenum].ignored)
+ continue;
+
+ /* Directly follows the current extent on the same swapfile. */
+ if ((new_sector == extent_max + 1) &&
+ (last_chain == swapfilenum)) {
+ extent_max++;
+ continue;
+ }
+
+ /* Not contiguous: store the finished extent, if there is one. */
+ if (extent_min > -1 && add_blocks_to_extent_chain(last_chain,
+ extent_min, extent_max)) {
+ printk(KERN_ERR "Out of memory while making block "
+ "chains.\n");
+ return -ENOMEM;
+ }
+
+ /* Start a new extent at this entry. */
+ extent_min = new_sector;
+ extent_max = new_sector;
+ last_chain = swapfilenum;
+ }
+
+ /* Store the extent that was still being built when the loop ended. */
+ if (extent_min > -1 && add_blocks_to_extent_chain(last_chain,
+ extent_min, extent_max)) {
+ printk(KERN_ERR "Out of memory while making block chains.\n");
+ return -ENOMEM;
+ }
+
+ return apply_header_reservation();
+}
+
+/*
+ * raw_to_real
+ *
+ * Convert a raw page count into the number of pages left once a
+ * per-page overhead of sizeof(unsigned long) + sizeof(int) bytes has been
+ * deducted. The overhead is computed as
+ *   (raw * entry + PAGE_SIZE + entry + 1) / (PAGE_SIZE + entry)
+ * pages, in unsigned long arithmetic.
+ *
+ * Returns the remaining page count, or 0 if it would be negative.
+ */
+static long raw_to_real(long raw)
+{
+	const unsigned long entry = sizeof(unsigned long) + sizeof(int);
+	const unsigned long stride = PAGE_SIZE + entry;
+	unsigned long overhead = (raw * entry + (stride + 1)) / stride;
+	long remaining = raw - overhead;
+
+	if (remaining < 0)
+		return 0;
+
+	return remaining;
+}
+
+/*
+ * toi_swap_storage_allocated
+ *
+ * Returns the allocated swap pages, less the pages reserved for the
+ * header, converted by raw_to_real().
+ */
+static int toi_swap_storage_allocated(void)
+{
+	long raw = swap_pages_allocated - header_pages_reserved;
+
+	return (int) raw_to_real(raw);
+}
+
+/*
+ * Like si_swapinfo, except that we don't include ram backed swap (compcache!).
+ *
+ * val: Filled in by this function. After the initial si_swapinfo() call,
+ * freeswap and totalswap are recomputed from the swap devices that are in
+ * use, have a swap map, are writable and whose disk name does not start
+ * with "ram".
+ *
+ * Only the caller's structure is written to; the file-scope swapinfo is
+ * touched only when the caller passes its address.
+ *
+ * NOTE(review): totalswap accumulates inuse_pages rather than pages, so it
+ * is really "swap in use" - confirm this is what callers expect.
+ */
+void si_swapinfo_no_compcache(struct sysinfo *val)
+{
+	unsigned int i;
+
+	si_swapinfo(val);
+	val->freeswap = 0;
+	val->totalswap = 0;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		struct swap_info_struct *si = get_swap_info_struct(i);
+		if ((si->flags & SWP_USED) && si->swap_map &&
+		    (si->flags & SWP_WRITEOK) &&
+		    (strncmp(si->bdev->bd_disk->disk_name, "ram", 3))) {
+			val->totalswap += si->inuse_pages;
+			val->freeswap += si->pages - si->inuse_pages;
+		}
+	}
+}
+/*
+ * toi_swap_storage_available
+ *
+ * The amount of free swap is sampled afresh on every call rather than
+ * remembered from allocation time, because other processes might have
+ * allocated swap in the mean time.
+ *
+ * Returns free swap plus what we already hold, less the header
+ * reservation, converted by raw_to_real().
+ */
+static int toi_swap_storage_available(void)
+{
+	long raw;
+
+	si_swapinfo_no_compcache(&swapinfo);
+
+	raw = (long) swapinfo.freeswap + swap_pages_allocated -
+		header_pages_reserved;
+
+	return (int) raw_to_real(raw);
+}
+
+/*
+ * toi_swap_initialise
+ *
+ * starting_cycle: Non-zero when a hibernation cycle is starting; nothing
+ * is done otherwise.
+ *
+ * Swapon the configured swapfile (if any) and make sure the resume device
+ * is open when one has been configured.
+ *
+ * Returns 0 on success or 1 if the resume device could not be opened. In
+ * the failure case resume_block_device is left NULL rather than holding an
+ * error pointer, so later checks of it remain valid.
+ */
+static int toi_swap_initialise(int starting_cycle)
+{
+	int result = 0;
+
+	if (!starting_cycle)
+		return 0;
+
+	enable_swapfile();
+
+	if (resume_swap_dev_t && !resume_block_device) {
+		resume_block_device = open_bdev(MAX_SWAPFILES,
+				resume_swap_dev_t, 1);
+		if (IS_ERR(resume_block_device)) {
+			resume_block_device = NULL;
+			result = 1;
+		}
+	}
+
+	return result;
+}
+
+/*
+ * toi_swap_cleanup
+ *
+ * ending_cycle: Non-zero when the hibernation cycle is finishing, in which
+ * case a swapfile we enabled ourselves is swapoff'd first.
+ *
+ * The block devices we opened are always closed and the cached signature
+ * page is always dropped.
+ */
+static void toi_swap_cleanup(int ending_cycle)
+{
+	if (ending_cycle != 0) {
+		disable_swapfile();
+	}
+
+	close_bdevs();
+	forget_signatures();
+}
+
+/*
+ * toi_swap_release_storage
+ *
+ * Reset the page counters and, if any swap has been recorded in
+ * swapextents, free every recorded swap entry, the extent chain itself and
+ * the block chains derived from it.
+ *
+ * Always returns 0.
+ */
+static int toi_swap_release_storage(void)
+{
+	struct hibernate_extent *extent;
+	unsigned long value;
+
+	header_pages_reserved = 0;
+	swap_pages_allocated = 0;
+
+	if (!swapextents.first)
+		return 0;
+
+	/* Free swap entries */
+	toi_extent_for_each(&swapextents, extent, value)
+		swap_free((swp_entry_t) { value });
+
+	toi_put_extent_chain(&swapextents);
+
+	free_block_chains();
+
+	return 0;
+}
+
+/*
+ * free_swap_range
+ *
+ * Free every swap entry whose value lies in [min, max]. Nothing is done if
+ * min is greater than max.
+ *
+ * The counter has the same width as the entry values: the callers pass
+ * complete swp_entry_t values, and a narrower int counter would drop the
+ * upper bits and free entries that were never allocated here.
+ */
+static void free_swap_range(unsigned long min, unsigned long max)
+{
+	unsigned long value = min;
+
+	if (min > max)
+		return;
+
+	/* Test before incrementing so that max == ULONG_MAX terminates. */
+	for (;;) {
+		swap_free((swp_entry_t) { value });
+		if (value == max)
+			break;
+		value++;
+	}
+}
+
+/*
+ * toi_swap_allocate_storage
+ *
+ * request: Number of pages wanted for the image.
+ *
+ * The target is request plus extra_pages (sizeof(unsigned long) +
+ * sizeof(int) bytes per requested page, rounded up to whole pages) plus the
+ * header reservation, less what swapextents already holds. If nothing more
+ * is needed only the header reservation is re-applied.
+ *
+ * Round robin allocation (where swap storage has the same priority).
+ * could make this very inefficient, so we track extents allocated on
+ * a per-swapfile basis.
+ *
+ * Returns -ENOSPC if fewer pages than wanted were obtained, else the last
+ * error from adding a pending extent, else the result of
+ * get_main_pool_phys_params().
+ */
+static int toi_swap_allocate_storage(int request)
+{
+ int i, result = 0, to_add[MAX_SWAPFILES], pages_to_get, extra_pages,
+ gotten = 0, result2;
+ /* Per-swapfile extent under construction; valid only while to_add[] is set. */
+ unsigned long extent_min[MAX_SWAPFILES], extent_max[MAX_SWAPFILES];
+
+ extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long)
+ + sizeof(int)), PAGE_SIZE);
+ pages_to_get = request + extra_pages - swapextents.size +
+ header_pages_reserved;
+
+ if (pages_to_get < 1)
+ return apply_header_reservation();
+
+ /*
+ * Record device details for every usable swap device. Devices whose
+ * disk name starts with "ram" are marked as ignored; entries for
+ * unusable slots are left as they were.
+ */
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ struct swap_info_struct *si = get_swap_info_struct(i);
+ to_add[i] = 0;
+ if (!(si->flags & SWP_USED) || !si->swap_map ||
+ !(si->flags & SWP_WRITEOK))
+ continue;
+ if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3)) {
+ devinfo[i].ignored = 1;
+ continue;
+ }
+ devinfo[i].ignored = 0;
+ devinfo[i].bdev = si->bdev;
+ devinfo[i].dev_t = si->bdev->bd_dev;
+ devinfo[i].bmap_shift = 3;
+ devinfo[i].blocks_per_page = 1;
+ }
+
+ /*
+ * Grab swap pages one at a time. Pages on ignored devices are still
+ * recorded in extents but do not count towards 'gotten'.
+ */
+ while (gotten < pages_to_get) {
+ swp_entry_t entry;
+ unsigned long new_value;
+ unsigned swapfilenum;
+
+ entry = get_swap_page();
+ if (!entry.val)
+ break;
+
+ swapfilenum = swp_type(entry);
+ new_value = entry.val;
+
+ /* First page seen on this swapfile: start an extent. */
+ if (!to_add[swapfilenum]) {
+ to_add[swapfilenum] = 1;
+ extent_min[swapfilenum] = new_value;
+ extent_max[swapfilenum] = new_value;
+ if (!devinfo[swapfilenum].ignored)
+ gotten++;
+ continue;
+ }
+
+ /* Extends the pending extent for this swapfile. */
+ if (new_value == extent_max[swapfilenum] + 1) {
+ extent_max[swapfilenum]++;
+ if (!devinfo[swapfilenum].ignored)
+ gotten++;
+ continue;
+ }
+
+ /* Not contiguous: store the pending extent and start another. */
+ if (toi_add_to_extent_chain(&swapextents,
+ extent_min[swapfilenum],
+ extent_max[swapfilenum])) {
+ printk(KERN_INFO "Failed to allocate extent for "
+ "%lu-%lu.\n", extent_min[swapfilenum],
+ extent_max[swapfilenum]);
+ /* Give back the pending extent and the new page. */
+ free_swap_range(extent_min[swapfilenum],
+ extent_max[swapfilenum]);
+ swap_free(entry);
+ if (!devinfo[swapfilenum].ignored)
+ gotten -= (extent_max[swapfilenum] -
+ extent_min[swapfilenum] + 1);
+ /* Don't try to add again below */
+ to_add[swapfilenum] = 0;
+ break;
+ } else {
+ extent_min[swapfilenum] = new_value;
+ extent_max[swapfilenum] = new_value;
+ if (!devinfo[swapfilenum].ignored)
+ gotten++;
+ }
+ }
+
+ /* Store the extents that were still pending when the loop ended. */
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ int this_result;
+
+ /* Anything to do for this swap entry? */
+ if (!to_add[i])
+ continue;
+
+ this_result = toi_add_to_extent_chain(&swapextents,
+ extent_min[i], extent_max[i]);
+
+ /* Added okay? */
+ if (!this_result)
+ continue;
+
+ /*
+ * Nope. Remember an error occurred, free the swap and subtract
+ * from the amount of swap allocated.
+ */
+ result = this_result;
+
+ free_swap_range(extent_min[i], extent_max[i]);
+ if (!devinfo[i].ignored)
+ gotten -= (extent_max[i] - extent_min[i] + 1);
+ }
+
+ if (gotten < pages_to_get) {
+ printk("Got fewer pages than required "
+ "(%d wanted, %d gotten).\n",
+ pages_to_get, gotten);
+ result = -ENOSPC;
+ }
+
+ swap_pages_allocated += (long) gotten;
+
+ /* The block chains are rebuilt even if we came up short. */
+ result2 = get_main_pool_phys_params();
+
+ return result ? result : result2;
+}
+
+/*
+ * toi_swap_write_header_init
+ *
+ * Start writing the image header. The information needed to bootstrap the
+ * read goes first: the saved start of the swap header page, the saved
+ * writer positions and the device information, followed by the block
+ * chain of every swapfile slot.
+ *
+ * Note that even if header pages take more than one page, when we read
+ * back the info, we will have restored the location of the next header
+ * page by the time we go to use it.
+ *
+ * Returns 0 on success or the error from writing a header chunk.
+ */
+static int toi_swap_write_header_init(void)
+{
+	struct swap_info_struct *si;
+	int slot, result;
+
+	toi_bio_ops.rw_init(WRITE, 0);
+	toi_writer_buffer_posn = 0;
+
+	result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
+			no_image_signature_contents,
+			sizeof(struct sig_data));
+	if (result)
+		return result;
+
+	/* Forward one page will be done prior to the read */
+	for (slot = 0; slot < MAX_SWAPFILES; slot++) {
+		si = get_swap_info_struct(slot);
+		devinfo[slot].dev_t = si->swap_file ?
+			si->bdev->bd_dev : (dev_t) 0;
+	}
+
+	result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
+			(char *) &toi_writer_posn_save,
+			sizeof(toi_writer_posn_save));
+	if (!result)
+		result = toi_bio_ops.rw_header_chunk(WRITE, &toi_swapops,
+				(char *) devinfo, sizeof(devinfo));
+	if (result)
+		return result;
+
+	for (slot = 0; slot < MAX_SWAPFILES; slot++)
+		toi_serialise_extent_chain(&toi_swapops, &block_chain[slot]);
+
+	return 0;
+}
+
+/*
+ * toi_swap_write_header_cleanup
+ *
+ * Finish writing the header and, only if that worked, write the signature
+ * that says we have an image.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+static int toi_swap_write_header_cleanup(void)
+{
+	int result = toi_bio_ops.write_header_chunk_finish();
+
+	if (result)
+		return result;
+
+	/* Set signature to say we have an image */
+	return write_modified_signature(IMAGE_SIGNATURE);
+}
+
+/* ------------------------- HEADER READING ------------------------- */
+
+/*
+ * read_header_init()
+ *
+ * Description:
+ * 1. Attempt to read the device specified with resume=.
+ * 2. Check the contents of the swap header for our signature.
+ * 3. Warn, ignore, reset and/or continue as appropriate.
+ * 4. If continuing, read the toi_swap configuration section
+ * of the header and set up block device info so we can read
+ * the rest of the header & image.
+ *
+ * The first header page is read directly from the header device and is
+ * unpacked in the order toi_swap_write_header_init() wrote it: saved swap
+ * header start, saved writer positions, device info. The block chains are
+ * then loaded.
+ *
+ * Returns:
+ * May not return if user choose to reboot at a warning.
+ * -EINVAL if cannot resume at this time. Booting should continue
+ * normally.
+ * Otherwise 0, or the error from opening a device, reading the first
+ * header page or loading an extent chain.
+ */
+
+static int toi_swap_read_header_init(void)
+{
+ int i, result = 0;
+ toi_writer_buffer_posn = 0;
+
+ /* header_dev_t is set by parse_signature() when our signature is found. */
+ if (!header_dev_t) {
+ printk(KERN_INFO "read_header_init called when we haven't "
+ "verified there is an image!\n");
+ return -EINVAL;
+ }
+
+ /*
+ * If the header is not on the resume_swap_dev_t, open the header
+ * device first; otherwise reuse the resume device.
+ */
+ if (header_dev_t != resume_swap_dev_t) {
+ header_block_device = open_bdev(MAX_SWAPFILES + 1,
+ header_dev_t, 1);
+
+ if (IS_ERR(header_block_device))
+ return PTR_ERR(header_block_device);
+ } else
+ header_block_device = resume_block_device;
+
+ toi_bio_ops.read_header_init();
+
+ /*
+ * Read toi_swap configuration.
+ * Headerblock size taken into account already.
+ */
+ result = toi_bio_ops.bdev_page_io(READ, header_block_device,
+ headerblock << 3,
+ virt_to_page((unsigned long) toi_writer_buffer));
+ if (result)
+ return result;
+
+ /* Unpack the page in the order it was written. */
+ memcpy(&no_image_signature_contents, toi_writer_buffer,
+ sizeof(no_image_signature_contents));
+
+ toi_writer_buffer_posn = sizeof(no_image_signature_contents);
+
+ memcpy(&toi_writer_posn_save, toi_writer_buffer +
+ toi_writer_buffer_posn, sizeof(toi_writer_posn_save));
+
+ toi_writer_buffer_posn += sizeof(toi_writer_posn_save);
+
+ memcpy(&devinfo, toi_writer_buffer + toi_writer_buffer_posn,
+ sizeof(devinfo));
+
+ toi_writer_buffer_posn += sizeof(devinfo);
+
+ /* Restore device info */
+ for (i = 0; i < MAX_SWAPFILES; i++) {
+ dev_t thisdevice = devinfo[i].dev_t;
+ struct block_device *bdev_result;
+
+ /* The bdev pointer saved in the image is not valid now. */
+ devinfo[i].bdev = NULL;
+
+ if (!thisdevice || devinfo[i].ignored)
+ continue;
+
+ /* Reuse devices that are already open. */
+ if (thisdevice == resume_swap_dev_t) {
+ devinfo[i].bdev = resume_block_device;
+ continue;
+ }
+
+ if (thisdevice == header_dev_t) {
+ devinfo[i].bdev = header_block_device;
+ continue;
+ }
+
+ bdev_result = open_bdev(i, thisdevice, 1);
+ if (IS_ERR(bdev_result))
+ return PTR_ERR(bdev_result);
+ devinfo[i].bdev = bdevs_opened[i]->bdev;
+ }
+
+ toi_extent_state_goto_start(&toi_writer_posn);
+ toi_bio_ops.set_extra_page_forward();
+
+ /* Stop at the first chain that fails to load. */
+ for (i = 0; i < MAX_SWAPFILES && !result; i++)
+ result = toi_load_extent_chain(&block_chain[i]);
+
+ return result;
+}
+
+/*
+ * toi_swap_read_header_cleanup
+ *
+ * Clean up the read side of the block I/O code once the header has been
+ * read. Always returns 0.
+ */
+static int toi_swap_read_header_cleanup(void)
+{
+	const int rw = READ;
+
+	toi_bio_ops.rw_cleanup(rw);
+
+	return 0;
+}
+
+/*
+ * toi_swap_memory_needed
+ *
+ * Description:
+ * The memory requirement reported through the memory_needed hook of
+ * toi_swapops. This allocator always reports 1.
+ *
+ * NOTE(review): the previous comment described the value as a number
+ * of bytes of RAM (used when calculating whether we have enough
+ * memory to be able to hibernate & resume). The unit is not visible
+ * in this file - confirm against the users of the hook.
+ */
+static int toi_swap_memory_needed(void)
+{
+ return 1;
+}
+
+/*
+ * toi_swap_print_debug_stats
+ *
+ * buffer: Where to put the text.
+ * size: Space available in buffer.
+ *
+ * Says whether the swap allocator is the active allocator. If it is, the
+ * swapfile configured for automatic swapon (if any) and the swap available
+ * for the image are reported as well.
+ *
+ * Returns the number of characters written.
+ */
+static int toi_swap_print_debug_stats(char *buffer, int size)
+{
+	struct sysinfo info;
+	int used;
+
+	if (toiActiveAllocator != &toi_swapops)
+		return scnprintf(buffer, size,
+				"- SwapAllocator inactive.\n");
+
+	used = scnprintf(buffer, size, "- SwapAllocator active.\n");
+
+	if (swapfilename[0])
+		used += scnprintf(buffer + used, size - used,
+			" Attempting to automatically swapon: %s.\n",
+			swapfilename);
+
+	si_swapinfo_no_compcache(&info);
+
+	used += scnprintf(buffer + used, size - used,
+			" Swap available for image: %d pages.\n",
+			(int) info.freeswap + toi_swap_storage_allocated());
+
+	return used;
+}
+
+/*
+ * toi_swap_storage_needed
+ *
+ * Returns the number of bytes of header space required for toi_swap's own
+ * data: the signature data, the saved writer positions, the device info
+ * and, for every swapfile slot, two ints plus two unsigned longs per
+ * extent in its block chain. This ignores the links between pages, which
+ * we factor in when allocating the space.
+ *
+ * We ensure the space is allocated, but actually save the data from
+ * write_header_init and therefore don't also define a save_config_info
+ * routine.
+ */
+static int toi_swap_storage_needed(void)
+{
+	int total = sizeof(struct sig_data) + sizeof(toi_writer_posn_save) +
+		sizeof(devinfo);
+	int chain = 0;
+
+	while (chain < MAX_SWAPFILES) {
+		total += 2 * sizeof(int);
+		total += (2 * sizeof(unsigned long) *
+			block_chain[chain].num_extents);
+		chain++;
+	}
+
+	return total;
+}
+
+/*
+ * Image_exists
+ *
+ * quiet: Non-zero to suppress the messages describing what was found.
+ *
+ * Opens the resume device if that has not been done yet and classifies
+ * the signature found at resume_firstblock.
+ *
+ * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). If the resume
+ * device cannot be opened, resume_block_device is left NULL rather than
+ * holding an error pointer, so that the next call tries to open it again
+ * instead of doing I/O on an invalid pointer.
+ */
+static int toi_swap_image_exists(int quiet)
+{
+	int signature_found;
+
+	if (!resume_swap_dev_t) {
+		if (!quiet)
+			printk(KERN_INFO "Not even trying to read header "
+				"because resume_swap_dev_t is not set.\n");
+		return -1;
+	}
+
+	if (!resume_block_device) {
+		resume_block_device = open_bdev(MAX_SWAPFILES,
+				resume_swap_dev_t, 1);
+		if (IS_ERR(resume_block_device)) {
+			resume_block_device = NULL;
+			if (!quiet)
+				printk(KERN_INFO "Failed to open resume dev_t"
+						" (%x).\n", resume_swap_dev_t);
+			return -1;
+		}
+	}
+
+	signature_found = parse_signature();
+
+	switch (signature_found) {
+	case -ENOMEM:
+		return -1;
+	case -1:
+		if (!quiet)
+			printk(KERN_ERR "TuxOnIce: Unable to find a signature."
+				" Could you have moved a swap file?\n");
+		return -1;
+	case 0:
+	case 1:
+		if (!quiet)
+			printk(KERN_INFO "TuxOnIce: Normal swapspace found.\n");
+		return 0;
+	case 2:
+	case 3:
+	case 4:
+		if (!quiet)
+			printk(KERN_INFO "TuxOnIce: Detected another "
+				"implementation's signature.\n");
+		return 0;
+	case 10:
+		if (!quiet)
+			printk(KERN_INFO "TuxOnIce: Detected TuxOnIce binary "
+				"signature.\n");
+		return 1;
+	}
+
+	/* Any other value, e.g. an I/O error from reading the page. */
+	printk("Unrecognised parse_signature result (%d).\n", signature_found);
+	return 0;
+}
+
+/* toi_swap_remove_image
+ *
+ * Release the swap we allocated (when anything can have been allocated)
+ * and, if an image signature is present on the resume device, put the
+ * original swap signature back.
+ *
+ * Returns 0 if there was nothing to rewrite, otherwise the result of
+ * writing the signature page.
+ */
+static int toi_swap_remove_image(void)
+{
+	/*
+	 * If nr_hibernates == 0, we must be booting, so no swap pages
+	 * will be recorded as used yet.
+	 */
+
+	if (nr_hibernates)
+		toi_swap_release_storage();
+
+	/*
+	 * We don't do a sanity check here: we want to restore the swap
+	 * whatever version of kernel made the hibernate image.
+	 *
+	 * We need to write swap, but swap may not be enabled so
+	 * we write the device directly
+	 *
+	 * Only rewrite the signature when an image was positively
+	 * identified. toi_swap_image_exists() also returns -1 for "don't
+	 * know" (no resume device, open or read failure, unrecognised
+	 * contents); in those cases we didn't read an image header, so
+	 * don't change anything.
+	 */
+
+	if (toi_swap_image_exists(1) <= 0)
+		return 0;
+
+	return write_modified_signature(NO_IMAGE_SIGNATURE);
+}
+
+/*
+ * Mark resume attempted.
+ *
+ * mark: Non-zero to record that a resume from this image was tried, zero
+ * to clear that record.
+ *
+ * Returns -ENODEV if no resume device has been set, otherwise the result
+ * of writing the modified signature page.
+ */
+static int toi_swap_mark_resume_attempted(int mark)
+{
+	int modification = mark ? TRIED_RESUME : NO_TRIED_RESUME;
+
+	if (resume_swap_dev_t)
+		return write_modified_signature(modification);
+
+	printk(KERN_INFO "Not even trying to record attempt at resuming"
+			" because resume_swap_dev_t is not set.\n");
+	return -ENODEV;
+}
+
+/*
+ * Parse Image Location
+ *
+ * Attempt to parse a resume= parameter.
+ * Swap Writer accepts:
+ * resume=swap:DEVNAME[:FIRSTBLOCK]
+ * and, failing the "swap:" prefix, a plain resume=/dev/...
+ *
+ * Where:
+ * DEVNAME is convertable to a dev_t by name_to_dev_t
+ * FIRSTBLOCK is the location of the first block in the swap file
+ * (specifying for a swap partition is nonsensical but not prohibited).
+ * Data is validated by attempting to read a swap header from the
+ * location given. Failure will result in toi_swap refusing to
+ * save an image, and a reboot with correct parameters will be
+ * necessary.
+ *
+ * NOTE(review): an '@' also ends the scan for the DEVNAME separator, but
+ * no block size is parsed here; a FIRSTBLOCK followed by "@..." is
+ * presumably rejected by strict_strtoul - confirm.
+ *
+ * commandline: The resume= value. It is modified temporarily while being
+ * parsed and restored before returning.
+ * only_allocator: Not used by this allocator.
+ * quiet: Non-zero to suppress messages.
+ *
+ * Returns 1 if the parameter is not for this allocator, -EINVAL if it
+ * cannot be used and 0 if hibernating and resuming have been enabled.
+ */
+static int toi_swap_parse_sig_location(char *commandline,
+		int only_allocator, int quiet)
+{
+	char *thischar, *devstart, *colon = NULL;
+	int signature_found, result = -EINVAL, temp_result = 0;
+
+	if (strncmp(commandline, "swap:", 5)) {
+		/*
+		 * Failing swap:, we'll take a simple
+		 * resume=/dev/hda2, but fall through to
+		 * other allocators if /dev/ isn't matched.
+		 */
+		if (strncmp(commandline, "/dev/", 5))
+			return 1;
+	} else
+		commandline += 5;
+
+	/* Find the end of the device name (at most 250 characters in). */
+	devstart = commandline;
+	thischar = commandline;
+	while ((*thischar != ':') && (*thischar != '@') &&
+		((thischar - commandline) < 250) && (*thischar))
+		thischar++;
+
+	if (*thischar == ':') {
+		unsigned long block;
+
+		/* Terminate the device name; the colon is put back below. */
+		colon = thischar;
+		*colon = 0;
+
+		/*
+		 * Keep the full width of the parsed value: narrowing it to
+		 * an int would corrupt block numbers above INT_MAX.
+		 */
+		temp_result = strict_strtoul(colon + 1, 0, &block);
+		if (!temp_result)
+			resume_firstblock = block;
+	} else
+		resume_firstblock = 0;
+
+	clear_toi_state(TOI_CAN_HIBERNATE);
+	clear_toi_state(TOI_CAN_RESUME);
+
+	if (!temp_result)
+		temp_result = try_to_parse_resume_device(devstart, quiet);
+
+	if (colon)
+		*colon = ':';
+
+	if (temp_result)
+		return -EINVAL;
+
+	signature_found = toi_swap_image_exists(quiet);
+
+	if (signature_found != -1) {
+		result = 0;
+
+		toi_bio_ops.set_devinfo(devinfo);
+		toi_writer_posn.chains = &block_chain[0];
+		toi_writer_posn.num_chains = MAX_SWAPFILES;
+		set_toi_state(TOI_CAN_HIBERNATE);
+		set_toi_state(TOI_CAN_RESUME);
+	} else
+		if (!quiet)
+			printk(KERN_ERR "TuxOnIce: SwapAllocator: No swap "
+				"signature found at %s.\n", devstart);
+	return result;
+}
+
+/*
+ * header_locations_read_sysfs
+ *
+ * page: Output buffer for the text.
+ * count: Not used by this routine.
+ *
+ * For every active swap device, describe the resume= value to use: a
+ * generic hint (printed once) for swap partitions and, for each swapfile,
+ * its path (at most 30 characters of it), its block device and the sector
+ * of its first block.
+ *
+ * Returns the number of characters written; 0 if there is no output
+ * buffer or no scratch page could be allocated.
+ */
+static int header_locations_read_sysfs(const char *page, int count)
+{
+	int i, printedpartitionsmessage = 0, len = 0, haveswap = 0;
+	struct inode *swapf = NULL;
+	int zone;
+	char *path_page;
+	char *path, *output = (char *) page;
+
+	if (!page)
+		return 0;
+
+	/* Allocate only once we know we have somewhere to put the output. */
+	path_page = (char *) toi_get_free_page(10, GFP_KERNEL);
+	if (!path_page)
+		return 0;
+
+	for (i = 0; i < MAX_SWAPFILES; i++) {
+		struct swap_info_struct *si = get_swap_info_struct(i);
+
+		if (!si->swap_file)
+			continue;
+
+		haveswap = 1;
+
+		if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) {
+			if (!printedpartitionsmessage) {
+				len += sprintf(output + len,
+					"For swap partitions, simply use the "
+					"format: resume=swap:/dev/hda1.\n");
+				printedpartitionsmessage = 1;
+			}
+			continue;
+		}
+
+		path = d_path(&si->swap_file->f_path, path_page, PAGE_SIZE);
+		/* No usable name for this swapfile; nothing to report. */
+		if (IS_ERR(path))
+			continue;
+
+		swapf = si->swap_file->f_mapping->host;
+		zone = bmap(swapf, 0);
+
+		/*
+		 * path points into path_page, so it is printed in place
+		 * with a 30 character limit instead of first being copied
+		 * to the start of the same buffer.
+		 */
+		if (!zone) {
+			len += sprintf(output + len,
+				"Swapfile %.30s has been corrupted. Reuse"
+				" mkswap on it and try again.\n",
+				path);
+		} else {
+			char name_buffer[255];
+			len += sprintf(output + len,
+				"For swapfile `%.30s`,"
+				" use resume=swap:/dev/%s:0x%x.\n",
+				path,
+				bdevname(si->bdev, name_buffer),
+				zone << (swapf->i_blkbits - 9));
+		}
+	}
+
+	if (!haveswap)
+		len = sprintf(output, "You need to turn on swap partitions "
+			"before examining this file.\n");
+
+	toi_free_page(10, (unsigned long) path_page);
+	return len;
+}
+
+/*
+ * sysfs entries for the swap writer; referenced from toi_swapops.sysfs_data.
+ *   swapfilename    - read/write string backed by swapfilename.
+ *   headerlocations - read-only; text is produced by
+ *                     header_locations_read_sysfs().
+ *   enabled         - read/write integer backed by toi_swapops.enabled.
+ * NOTE(review): the remaining macro arguments (255; 0, 1, 0; and the trailing
+ * attempt_to_parse_resume_device2) are presumably a length limit, a value
+ * range plus flags, and a hook run after a write - confirm against the
+ * SYSFS_* macro definitions.
+ */
+static struct toi_sysfs_data sysfs_params[] = {
+ SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL),
+ SYSFS_CUSTOM("headerlocations", SYSFS_READONLY,
+ header_locations_read_sysfs, NULL, 0, NULL),
+ SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0,
+ attempt_to_parse_resume_device2),
+};
+
+/*
+ * Module descriptor for the swap storage writer. toi_swap_load() passes it
+ * to toi_register_module() after filling in the block-I/O callbacks
+ * (rw_init, rw_cleanup, read_page, write_page, rw_header_chunk, ...) from
+ * toi_bio_ops, which is why those members are not initialised here.
+ */
+static struct toi_module_ops toi_swapops = {
+ .type = WRITER_MODULE,
+ .name = "swap storage",
+ .directory = "swap",
+ .module = THIS_MODULE,
+ .memory_needed = toi_swap_memory_needed,
+ .print_debug_info = toi_swap_print_debug_stats,
+ .storage_needed = toi_swap_storage_needed,
+ .initialise = toi_swap_initialise,
+ .cleanup = toi_swap_cleanup,
+
+ /* Callbacks dealing with storage, the image header and its signature. */
+ .noresume_reset = toi_swap_noresume_reset,
+ .storage_available = toi_swap_storage_available,
+ .storage_allocated = toi_swap_storage_allocated,
+ .reserve_header_space = toi_swap_reserve_header_space,
+ .allocate_storage = toi_swap_allocate_storage,
+ .image_exists = toi_swap_image_exists,
+ .mark_resume_attempted = toi_swap_mark_resume_attempted,
+ .write_header_init = toi_swap_write_header_init,
+ .write_header_cleanup = toi_swap_write_header_cleanup,
+ .read_header_init = toi_swap_read_header_init,
+ .read_header_cleanup = toi_swap_read_header_cleanup,
+ .remove_image = toi_swap_remove_image,
+ .parse_sig_location = toi_swap_parse_sig_location,
+
+ /* sysfs table and its element count. */
+ .sysfs_data = sysfs_params,
+ .num_sysfs_entries = sizeof(sysfs_params) /
+ sizeof(struct toi_sysfs_data),
+};
+
+/* ---- Registration ---- */
+/*
+ * toi_swap_load - initcall that registers the swap storage module.
+ *
+ * Copies the block-I/O hooks from toi_bio_ops into toi_swapops and then
+ * hands the descriptor to toi_register_module().
+ *
+ * Returns whatever toi_register_module() returns.
+ */
+static __init int toi_swap_load(void)
+{
+	struct toi_module_ops *ops = &toi_swapops;
+
+	/* Start and end of an I/O pass. */
+	ops->rw_init = toi_bio_ops.rw_init;
+	ops->rw_cleanup = toi_bio_ops.rw_cleanup;
+
+	/* Image page transfer. */
+	ops->read_page = toi_bio_ops.read_page;
+	ops->write_page = toi_bio_ops.write_page;
+
+	/* Header transfer, with and without readahead. */
+	ops->rw_header_chunk = toi_bio_ops.rw_header_chunk;
+	ops->rw_header_chunk_noreadahead =
+		toi_bio_ops.rw_header_chunk_noreadahead;
+
+	/* Flushing, throttling and completion. */
+	ops->io_flusher = toi_bio_ops.io_flusher;
+	ops->update_throughput_throttle =
+		toi_bio_ops.update_throughput_throttle;
+	ops->finish_all_io = toi_bio_ops.finish_all_io;
+
+	return toi_register_module(ops);
+}
+
+late_initcall(toi_swap_load);
If you replace tuxonice_core-objs with tuxonice_core-y in your Makefile the
above can be simplified to:
tuxonice_core-$(CONFIG_NET) += tuxonice_netlink.o
(On the assumption that CONFIG_NET is a bool)
Sam
On Wed, 2009-05-06 at 23:03 +0200, Sam Ravnborg wrote:
> On Thu, May 07, 2009 at 12:39:05AM +1000, Nigel Cunningham wrote:
> > This patch adds support for communicating with a userspace helper via a
> > netlink socket. It is used by the userspace user interface support and
> > by the storage manager support.
> >
> > Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
> > ---
> > kernel/power/Makefile | 4 +
> > kernel/power/tuxonice_netlink.c | 339 +++++++++++++++++++++++++++++++++++++++
> > 2 files changed, 343 insertions(+), 0 deletions(-)
> > create mode 100644 kernel/power/tuxonice_netlink.c
> >
> > diff --git a/kernel/power/Makefile b/kernel/power/Makefile
> > index 07efc8a..180b89a 100644
> > --- a/kernel/power/Makefile
> > +++ b/kernel/power/Makefile
> > @@ -10,6 +10,10 @@ tuxonice_core-objs := tuxonice_modules.o tuxonice_sysfs.o tuxonice_highlevel.o \
> >
> > obj-$(CONFIG_TOI) += tuxonice_builtin.o
> >
> > +ifdef CONFIG_NET
> > +tuxonice_core-objs += tuxonice_netlink.o
> > +endif
>
> If you replace tuxonice_core-objs with tuxonice_core-y in your Makefile the
> above can be simplified to:
> tuxonice_core-$(CONFIG_NET) += tuxonice_netlink.o
>
> (On the assumption that CONFIG_NET is a bool)
Thanks for the review.
I have support for building tuxonice as modules that I'm not asking to
be merged (at least not yet - it adds too many symbol exports at the
moment). As a result, I did the simplification but left the
tuxonice_core_objs definition. Is that what you'd do?
Regards,
Nigel
tuxonice_core-objs and tuxonice_core-y have equal semantics.
kbuild will recognize all .o files specified using tuxonice_core-y
as modules to be part of the tuxonice_core module.
So there is no need to define tuxonice_core-objs - that can
be replaced by tuxonice_core-y.
Sam
> I'd like to submit TuxOnIce for review, with a view to seeking to get it
> merged, perhaps in 2.6.31 or .32 (depending upon what needs work before
> it can be merged) and the willingness of those who matter.
>
> To briefly summarise the advantages to merging TuxOnIce:
>
> - Support for multiple swap devices
> - Support for non-swap (an ordinary file can be used)
> - Uses cryptoapi (LZO support, more than 2x speed of uncompressed data!)
> - Asynchronous I/O, readahead, multithreaded. Get the maximum throughput
> possible with your hardware.
> - Userspace user interface that lets you abort hibernating and abort
> resuming, get nice progress display etc.
All these are either done by uswsusp already, or could be done w/o
modifying kernel code.
> - 8 years of testing and improvement.
Yeah, 8 years of out of tree testing; while current code is used by
basically every distro out there.
> - Full image of memory (LRU pages that don't need to be atomically
> copied are saved prior to the atomic copy, then used as the
> destination of the atomic copy).
Yeah, that was the patch that did not go in
> - Support for resuming a different image after writing an image - makes
> powering down a UPS after writing an image doable.
This can be done w/o kernel code. Someone had a patch...
> - Simple to set up (works without any userspace binaries, uses existing
> resume= and noresume commandline parameters).
Given that distros ship uswsusp already... is that really an advantage.
To summarise disadvantages:
- only core has 8000 LoC
- it does stuff that can be easily done in userspace
(and that todays distros _do_ in userspace).
- it duplicates uswsusp functionality.
- compared to [u]swsusp, it received little testing
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
> Given that distros ship uswsusp already... is that really an advantage.
Uswsusp has no compression. A suspend-to-disk that can take up to 5
minutes in uswsusp takes 20 seconds with TuxOnIce. Uswsusp has no built-
in hooks that allow me to run commands and set options during the suspend
and resume process.
-Kenny
--
Kenneth R. Crudup Sr. SW Engineer, Scott County Consulting, Los Angeles
O: 3630 S. Sepulveda Blvd. #138, L.A., CA 90034-6809 (888) 454-8181
There are(/were) competing ieee1394 OHCI drivers and competing Bluetooth
USB HCI drivers, and that's just off the top of my head- so having
competing suspend/resume methods won't be anything new. Let the market
decide which is best.
Given that tuxonice DOES work reliably with dm-crypt encrypted swap
partitions, does uswsusp offer this functionality? Looking into the docs
on the sourceforge site does not give any information about this point.
Further, being someone with a crypto background, the userland suspend
helper does have some issues with the password/key handling:
1) no support for keys containing a 0 byte. (cryptsetup can handle this)
2) the rsa functionality seems to use the unsound practice to encrypt
the symmetric key directly without any padding (aka encoding in libgcrypt).
This is, unfortunately, something really fundamenta...
Just to add my 2 cents...
Ulrich
Actually, I see advantages of working together versus fighting flame wars.
Please stop that, I'm not going to take part in it this time.
Best,
Rafael
I'm not sure you're totally correct, but anyway patches are welcome to improve
it.
Also, as you can see from my previous message, I'm not a big fan of fighting
over this once again. Lets try to work together to do something productive
instead, shall we?
Best,
Rafael
> Lets try to work together to do something productive instead, shall we?
WHS.
-Kenny
--
Kenneth R. Crudup Sr. SW Engineer, Scott County Consulting, Los Angeles
O: 3630 S. Sepulveda Blvd. #138, L.A., CA 90034-6809 (888) 454-8181
thank you rafael. i would really like to see the fighting stop, too.
Perhaps the best place to start would be to look directly at the
patches and, if there are weaknesses, discuss how to improve them?
m
Yes, I've talked about that already with Nigel and there's a kind of a plan.
Thanks,
For not American people this expands to .... ?
> -Kenny
>
> --
> Kenneth R. Crudup Sr. SW Engineer, Scott County Consulting, Los Angeles
> O: 3630 S. Sepulveda Blvd. #138, L.A., CA 90034-6809 (888) 454-8181
> _______________________________________________
> TuxOnIce-devel mailing list
> TuxOnIc...@lists.tuxonice.net
> http://lists.tuxonice.net/mailman/listinfo/tuxonice-devel
On Thu, 7 May 2009, Fabio Comolli wrote:
> For not American people this expands to .... ?
Sorry, everyone; "What He Said", or "ditto".
-Kenny
--
Kenneth R. Crudup Sr. SW Engineer, Scott County Consulting, Los Angeles
O: 3630 S. Sepulveda Blvd. #138, L.A., CA 90034-6809 (888) 454-8181
Well, if I may jump in I suggest you start with the full-memory image
functionality. This is in my opinion the biggest advantage in tuxonice
vs. uswsusp.
> Thanks,
> Rafael
Regards,
Fabio
> _______________________________________________
> TuxOnIce-devel mailing list
> TuxOnIc...@lists.tuxonice.net
> http://lists.tuxonice.net/mailman/listinfo/tuxonice-devel
>
In fact I agree, but there's a catch. The way in which TuxOnIce operates LRU
pages is based on some assumptions that may or may not be satisfied in future,
so if we decide to merge it, then we'll have to make sure these assumptions
will be satisfied. That in turn is going to require quite some discussion I
guess.
We'll see how the things work out.
Thanks,
Rafael
On Thu, 2009-05-07 at 19:42 +0200, Rafael J. Wysocki wrote:
> Actually, I see advantages of working together versus fighting flame wars.
> Please stop that, I'm not going to take part in it this time.
And neither am I.
Regards,
Nigel
On Thu, 2009-05-07 at 21:27 +0200, Rafael J. Wysocki wrote:
> In fact I agree, but there's a catch. The way in which TuxOnIce operates LRU
> pages is based on some assumptions that may or may not be satisfied in future,
> so if we decide to merge it, then we'll have to make sure these assumptions
> will be satisfied. That in turn is going to require quite some discussion I
> guess.
Agreed. That's why I've got that GEMS patch - it's putting pages on the
LRU that don't satisfy the former assumptions: they are used during
hibernating and need to be atomically copied. If there are further
developments in that area, I would hope we could just extend what's been
done with GEMS.
Regards,
Nigel
> >> - Support for multiple swap devices
> >> - Support for non-swap (an ordinary file can be used)
> >> - Uses cryptoapi (LZO support, more than 2x speed of uncompressed data!)
> >> - Asynchronous I/O, readahead, multithreaded. Get the maximum throughput
> >> possible with your hardware.
> >> - Userspace user interface that lets you abort hibernating and abort
> >> resuming, get nice progress display etc.
> >
> > All these are either done by uswsusp already, or could be done w/o
> > modifying kernel code.
>
> Given that tuxonice DOES work reliably with dm-crypt encrypted swap
> partitions, does uswsusp offer this functionality? Looking into the docs
> on the sourceforge site does not give any information about this
> - point.
uswsusp does not care what kind of device it writes on, so yes,
dm-crypt works. IIRC seife was working on some YaST scripts so that
setting it up was easy for users.
> Further, being someone with a crypto background, the userland suspend
> helper does have some issues with the password/key handling:
> 1) no support for keys containing a 0 byte. (cryptsetup can handle
> this)
"passwords containing 0 byte" ? Is that a big deal? How would user
enter such password?
> 2) the rsa functionality seems to use the unsound practice to encrypt
> the symmetric key directly without any padding (aka encoding in libgcrypt).
If this is easy to fix, can you tell me what to do?
Otherwise, can you write a short note to bugtraq? Having bad crypto in
uswsusp would indeed be very very bad.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Neither tuxonice nor swsusp can write full-memory image (it is mostly
impossible to do in unlikely case of all memory is consumed by
kmalloc). Now, tuxonice can produce bigger images than swsusp...
Rafael had a short patch for a similar effect some time ago, but we could
not find anyone to really review it...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Ok, so what do you propose? Merging tuxonice into 2.6.32, resulting in
having swsusp,uswsusp *and* tuxonice to maintain? I hope not.
If we are talking about improving mainline to allow tuxonice
functionality... then yes, that sounds reasonable.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
>
>
>To summarise disadvantages:
>
>- only core has 8000 LoC
>- it does stuff that can be easily done in userspace
> (and that todays distros _do_ in userspace).
>- it duplicates uswsusp functionality.
>- compared to [u]swsusp, it received little testing
>
To summarise advantages - for me tuxonice is the only hibernation method that works.
(Till now I've had 3 machines - no one of them able to resume with in-kernel swsusp.)
>_______________________________________________
>TuxOnIce-devel mailing list
>TuxOnIc...@lists.tuxonice.net
>http://lists.tuxonice.net/mailman/listinfo/tuxonice-devel
>
--
> Hi.
>
> On Thu, 2009-05-07 at 21:27 +0200, Rafael J. Wysocki wrote:
> > In fact I agree, but there's a catch. The way in which TuxOnIce
> > operates LRU pages is based on some assumptions that may or may not
> > be satisfied in future, so if we decide to merge it, then we'll
> > have to make sure these assumptions will be satisfied. That in
> > turn is going to require quite some discussion I guess.
>
> Agreed. That's why I've got that GEMS patch - it's putting pages on
> the LRU that don't satisfy the former assumptions: they are used
> during hibernating and need to be atomically copied. If there are
> further developments in that area, I would hope we could just extend
> what's been done with GEMS.
Another option here would be to suspend all DRM operations earlier.
The suspend hook for i915 already does this, but maybe it needs to
happen sooner? We'll probably want a generic DRM suspend hook soon too
(as the radeon memory manager lands) to shut down GPU activity in the
suspend and hibernate cases.
All that assumes I understand what's going on here though. :) It
appears you delay saving the GEM (just GEM by the way, for Graphics/GPU
Execution Manager) backing store until late to avoid having the pages
move around out from under you?
Thanks,
Jesse
On Thu, 2009-05-07 at 16:14 -0700, Jesse Barnes wrote:
> On Fri, 08 May 2009 06:41:00 +1000
> Nigel Cunningham <ni...@tuxonice.net> wrote:
>
> > Hi.
> >
> > On Thu, 2009-05-07 at 21:27 +0200, Rafael J. Wysocki wrote:
> > > In fact I agree, but there's a catch. The way in which TuxOnIce
> > > operates LRU pages is based on some assumptions that may or may not
> > > be satisfied in future, so if we decide to merge it, then we'll
> > > have to make sure these assumptions will be satisfied. That in
> > > turn is going to require quite some discussion I guess.
> >
> > Agreed. That's why I've got that GEMS patch - it's putting pages on
> > the LRU that don't satisfy the former assumptions: they are used
> > during hibernating and need to be atomically copied. If there are
> > further developments in that area, I would hope we could just extend
> > what's been done with GEMS.
>
> Another option here would be to suspend all DRM operations earlier.
> The suspend hook for i915 already does this, but maybe it needs to
> happen sooner? We'll probably want a generic DRM suspend hook soon too
> (as the radeon memory manager lands) to shut down GPU activity in the
> suspend and hibernate cases.
>
> All that assumes I understand what's going on here though. :) It
> appears you delay saving the GEM (just GEM by the way, for Graphics/GPU
> Execution Manager) backing store until late to avoid having the pages
> move around out from under you?
Yeah. TuxOnIce saves some pages without doing an atomic copy of them. Up
'til now, the algorithm has been LRU pages - pages used for TuxOnIce's
userspace helpers. With GEM, we also need to make sure GEM pages are
atomically copied and so also 'subtract' them from the list of pages
that aren't atomically copied.
It's no great problem to do this, so I wouldn't ask you to change GEM to
suspend DRM operations earlier. It's more important that GEM doesn't
allocate extra pages unexpectedly - and I don't think that's likely
anyway since we've switched away from X. This is important because
TuxOnIce depends (for reliability) on having memory usage being
predictable much more than swsusp and uswsusp do. (Larger images, less
free RAM to begin with).
Regards,
Nigel
Yeah X is typically the one causing GEM allocations and performing
execution, but there are other possibilities too. E.g. Wayland is a
non-X based display system that may be running instead, or maybe
there's an EGL or GPGPU program running in the background.
So I think it's best if we suspend DRM fairly early, otherwise you
*may* get extra allocations and will probably see all sorts of GPU
memory mapping activity and execution while you're trying to hibernate
things. On the plus side I don't think this is a radical redesign or
anything, and mostly something we can do in our suspend and hibernate
callbacks.
Thanks,
Jesse
Works here. Just tried (s2disk, vanilla 2.6.29). It also claims you
can hit "Del" to abort and that it compressed the image.
Alex
That won't stop updates to the framebuffer?
> Thanks,
Thank _you_!
Nigel
I think hibernate callbacks are too late for that. PM notifiers, OTOH, are
probably too early.
So, I really think we'll need to do something special about it.
Thanks,
Rafael
No you can still have the framebuffer mapped and write to it (as long
as we don't invalidate such mappings at DRM suspend time that is, but
there's no reason to do that).
Does that mean tuxonice writes directly to framebuffer memory when
suspending? Or does it just rely on a userspace program that does
that? I'm just curious, either way should work fine, drm-wise.
Jesse
It doesn't write directly to the framebuffer memory. It may (or may not)
have a userspace program running that uses the framebuffer to display
progress.
Regards,
Nigel
TOI does not do this and that's why I've been using it on every distro I
can for the past 4 or 5 years. I for one would love to see TOI
capability added to the mainline to improve functionality and
performance. If we can't get all 3 maintained, then 2 that are better
would seem to suffice...
On Thu, 2009-05-07 at 23:51 +0200, Pavel Machek wrote:
> On Thu 2009-05-07 19:42:54, Rafael J. Wysocki wrote:
> > On Thursday 07 May 2009, Pavel Machek wrote:
> > > Hi!
> > >
> > > > I'd like to submit TuxOnIce for review, with a view to seeking to get it
> > > > merged, perhaps in 2.6.31 or .32 (depending upon what needs work before
> > > > it can be merged) and the willingness of those who matter.
> ...
> > > To summarise disadvantages:
> > >
> > > - only core has 8000 LoC
> > > - it does stuff that can be easily done in userspace
> > > (and that todays distros _do_ in userspace).
> > > - it duplicates uswsusp functionality.
> > > - compared to [u]swsusp, it received little testing
> >
> > Actually, I see advantages of working together versus fighting flame wars.
> > Please stop that, I'm not going to take part in it this time.
>
> Ok, so what do you propose? Merging tuxonice into 2.6.32, resulting in
> having swsusp,uswsusp *and* tuxonice to maintain? I hope not.
>
> If we are talking about improving mainline to allow tuxonice
> functionality... then yes, that sounds reasonable.
I'd like to see us have all three for one or two releases of vanilla,
just to give time to work out any issues that haven't been foreseen.
Once we're all confident that there are no regressions with
TuxOnIce, I'd remove swsusp. That's my ideal plan of attack.
Regards,
Nigel
Pavel, please. (u)swsusp can save images up to half the physical
memory IIRC; with toi I used to easily save 980MB of image on a 1GB
laptop. You can't honestly compare the two things.
> Rafael had a short patch for a similar effect some time ago, but we could
> not find anyone to really review it...
> Pavel
> --
Regards,
Fabio
So this is an idea to replace our current hibernation implementation with
TuxOnIce.
Which unfortunately I don't agree with.
I think we can get _one_ implementation out of the three, presumably keeping
the user space interface that will keep the current s2disk binaries happy, by
merging TuxOnIce code _gradually_. No "all at once" approach, please.
And by "merging" I mean _exactly_ that. Not adding new code and throwing
away the old one.
While I can work on creating one hibernation implementation by taking the
best ideas from all of the implementation we have at hand, I surely won't be
working on replacing our current code with TuxOnIce. If that disappoints you,
then I'm sorry.
Best,
Rafael
Please proceed to Plan B then.
Adding third core code framework to do the same thing is out of question
(probably same should have been said about adding second one in the past).
Moreover you will for sure hit much more regressions than you expect
(I "love" seeing over and over again when people/companies get trapped
into fallacy of superiority of their _own_ solution).
I really wouldn't consider teaming with Rafael to work on "swsuspOnTux"
(evolving the in-kernel code while re-using chunks of TuxOnIce code) as
a bad Plan B. It has the potential of resulting in a solution clearly
superior to all existing ones (TuxOnIce included).
Thanks,
Bart
Thanks for saying this Bart! That also is my point, I think we can create a
hibernation implementation which is clearly better than all we have right now
by joining forces without the "my code is *so* much better than yours" kind of
argumentation.
Also, I think we should aim at targets that are possible to achieve. Namely,
it is clear to me that various parts of TuxOnIce will have to be reviewed and
commented by the memory management people, the file systems people, the
architecture-dependent code maintainers and so on, and it is much easier to
get someone to have a look at a relatively short series of patches doing one
specific thing at a time than to have him review an entire subsystem.
Best,
Rafael
On Fri, May 8, 2009 at 3:17 AM, Shannon McMackin <smcm...@gmail.com> wrote:
..
> Just to add my 2 cents as a user of TOI. Every distro and release I've
> tried has one major issue with kernel hibernation. Upon resume when
> hibernating large images, there's a residual footprint in swap. Every
> further hibernation creates a larger footprint, to the order of an
> additional 5-7% each time. Nobody has ever cared in any forum to
> explain why or how I might change that.
>
Actually I'm not seeing this with uswsusp and 2.6.29.2:
fcomolli@hawking:~> dmesg|grep 'Preparing to enter system sleep state S4'
ACPI: Preparing to enter system sleep state S4
ACPI: Preparing to enter system sleep state S4
ACPI: Preparing to enter system sleep state S4
fcomolli@hawking:~> free
total used free shared buffers cached
Mem: 2039692 901252 1138440 0 118980 395236
-/+ buffers/cache: 387036 1652656
Swap: 2104504 0 2104504
I limit the image size to 950M.
Regards,
Fabio
That's my ideal. I know you and Pavel don't want to see us go down that
path, but I was asked "What do you propose?" and I answered that
question.
> Which unfortunately I don't agree with.
>
> I think we can get _one_ implementation out of the three, presumably keeping
> the user space interface that will keep the current s2disk binaries happy, by
> merging TuxOnIce code _gradually_. No "all at once" approach, please.
>
> And by "merging" I mean _exactly_ that. Not adding new code and throwing
> away the old one.
>
> While I can work on creating one hibernation implementation by taking the
> best ideas from all of the implementation we have at hand, I surely won't be
> working on replacing our current code with TuxOnIce. If that disappoints you,
> then I'm sorry.
But who is going to do that, and how and when. You're clearly too busy
working on enhancements to the driver model - enhancements that are good
and necessary (I'm not at all meaning this is bad). I'm only doing this
in a little bit of spare time. Pavel doesn't seem to be doing it at all.
And we have different ideas about how things should be done. Userspace
vs kernel space. Providing tuning knobs vs not. And so on.
And the code includes some fundamental differences. I freeze processes
and prepare the whole image before saving anything or doing an atomic
copy whereas you just free memory before doing the atomic copy. You save
everything in one part whereas I save the image in two parts. I take a
modular approach and you have everything hardwired.
Even if we did try to merge the three implementations, there'd come a
point where we either threw away core parts of TuxOnIce or dropped the
whole of [u]swsusp and started again - or both.
It doesn't disappoint me that you don't want to replace [u]swsusp with
TuxOnIce. I never thought you or Pavel would want to do that.
I did hold out a little hope that you at least might be supportive
anyway of getting TuxOnIce added into vanilla - if only so that users
can get a better hibernation experience while we work through merging
the three into one. I'd far rather get along well with you than have
some sort of competitive relationship.
Regards,
Nigel
On Fri, 2009-05-08 at 21:44 +0200, Bartlomiej Zolnierkiewicz wrote:
> Please proceed to Plan B then.
>
> Adding third core code framework to do the same thing is out of question
> (probably same should have been said about adding second one in the past).
Why? We have plenty of history of having multiple implementations of
things (slub, slab and slob...).
> Moreover you will for sure hit much more regressions than you expect
> (I "love" seeing over and over again when people/companies get trapped
> into fallacy of superiority of their _own_ solution).
That's just wrong. TuxOnIce deliberately doesn't remove any of swsusp or
uswsusp so that there's no chance of users having regressions. It
provides the _option_ of being a drop in replacement for swsusp, but it
doesn't force users to take that option.
Regressions happen at the moment because TuxOnIce isn't included in
vanilla. Users update from one version of stable to the next or from one
version of head to the next and expect TuxOnIce to keep working, and it
doesn't always do that because of changes to the vanilla code that
require an updated patch.
> I really wouldn't consider teaming with Rafael to work on "swsuspOnTux"
> (evolving the in-kernel code while re-using chunks of TuxOnIce code) as
> a bad Plan B. It has the potential of resulting in a solution clearly
> superior to all existing ones (TuxOnIce included).
If there are features in swsusp that TuxOnIce is lacking, or features to
uswsusp that TuxOnIce is lacking, please tell me what they are and I'll
implement them. In saying this, I don't deny that TuxOnIce code can
still be improved - there's a lot I'd still like to do.
Regards,
Nigel
On Fri, 2009-05-08 at 23:03 +0200, Rafael J. Wysocki wrote:
> Also, I think we should aim at targets that are possible to achieve.
> Namely,
> it is clear to me that various parts of TuxOnIce will have to be reviewed and
> commented by the memory management people, the file systems people, the
> architecture-dependent code maintainers and so on, and it is much easier to
> get someone to have a look at a relatively short series of patches doing one
> specific thing at a time than to have him review an entire subsystem.
Seeking review is why I've posted the code.
I'm not sure what file systems people have to review in what I've
posted.
I guess the memory management part would be the separate saving of LRU
pages and the means by which GEM pages are tracked.
Re architecture dependent code maintainers: I don't know what they'd
have to do. Nothing in arch/ is touched by the patches I posted.
Regards,
Nigel
Hi,
So you could also easily anticipate my reaction. :-)
Quite frankly, I don't really think this is realistic.
First, because it is technically too difficult to have all of the code reviewed
and _agreed_ _upon_ by everyone at once. And if it's not agreed upon, you'll
have to modify it and it won't be the same thing any more once you've done
that. Which I'd say is guaranteed, having had a quick look at the code.
Second, because realistically you shouldn't expect _anyone_ (be it me or Pavel
or just about anybody else) to throw away his own code and replace it with
yours just because *you* think your code is better. You really should have
listened to the HPA's talk at the OLS last year (here's a link to the paper
http://ols.fedoraproject.org/OLS/Reprints-2008/anvin-reprint.pdf, please see
Section 10) which was about merging open source projects, among other things. :-)
> > Which unfortunately I don't agree with.
> >
> > I think we can get _one_ implementation out of the three, presumably keeping
> > the user space interface that will keep the current s2disk binaries happy, by
> > merging TuxOnIce code _gradually_. No "all at once" approach, please.
> >
> > And by "merging" I mean _exactly_ that. Not adding new code and throwing
> > away the old one.
> >
> > While I can work on creating one hibernation implementation by taking the
> > best ideas from all of the implementation we have at hand, I surely won't be
> > working on replacing our current code with TuxOnIce. If that disappoints you,
> > then I'm sorry.
>
> But who is going to do that, and how and when. You're clearly too busy
> working on enhancements to the driver model - enhancements that are good
> and necessary (I'm not at all meaning this is bad). I'm only doing this
> in a little bit of spare time. Pavel doesn't seem to be doing it at all.
I think I can find some time to work on that. I've spent a lot of time
recently on improving the allocation of memory for hibernation images and I
think I can work on the other hibernation-related things either. The most
important thing to me is whether or not to put that into my todo list. If I
decide to do it, I'll find the time too.
> And we have different ideas about how things should be done. Userspace
> vs kernel space. Providing tuning knobs vs not. And so on.
This isn't _that_ important. Actually, I'm not against an entirely in-kernel
solution, as there are some clear benefits of doing it this way. We only
need to be careful enough not to break the existing setups.
> And the code includes some fundamental differences. I freeze processes
> and prepare the whole image before saving anything or doing an atomic
> copy whereas you just free memory before doing the atomic copy. You save
> everything in one part whereas I save the image in two parts.
IMO the differences are not that fundamental. The whole problem boils down
to using the same data structures for memory management and I think we can
reach an agreement here.
> I take a modular approach and you have everything hardwired.
That's because using modules wouldn't really make sense for us. :-)
> Even if we did try to merge the three implementations, there'd come a
> point where we either threw away core parts of TuxOnIce or dropped the
> whole of [u]swsusp and started again - or both.
I would be for starting again, really, using the experience we've collected so
far. How we technically do it is another matter. I personally would prefer
adding new code in such a way that it's useable from the start, so that it gets
tested and integrated with all of the subsystems we're touching.
> It doesn't disappoint me that you don't want to replace [u]swsusp with
> TuxOnIce. I never thought you or Pavel would want to do that.
>
> I did hold out a little hope that you at least might be supportive
> anyway of getting TuxOnIce added into vanilla -
That would take a lot of work and we'd also have to ask many other busy people
to do a lot of work for us. I think it's better to just avoid it at this point.
> if only so that users can get a better hibernation experience while we work
> through merging the three into one. I'd far rather get along well with you
> than have some sort of competitive relationship.
Great, let's do our best to be productive.
Best,
Rafael
With all respect to sl*b developers but things such as sl*b etc. are on
whole different level when it comes to complexity because their interactions
with weird hardware configurations are quite limited.
> > Moreover you will for sure hit much more regressions than you expect
> > (I "love" seeing over and over again when people/companies get trapped
> > into fallacy of superiority of their _own_ solution).
>
> That's just wrong. TuxOnIce deliberately doesn't remove any of swsusp or
> uswsusp so that there's no chance of users having regressions. It
> provides the _option_ of being a drop in replacement for swsusp, but it
> doesn't force users to take that option.
OK. What is exactly your plan for transition and for swsusp removal then?
> Regressions happen at the moment because TuxOnIce isn't included in
> vanilla. Users update from one version of stable to the next or from one
> version of head to the next and expect TuxOnIce to keep working, and it
> doesn't always do that because of changes to the vanilla code that
> require an updated patch.
I mean [u]swsusp -> TuxOnIce regressions.
> > I really wouldn't consider teaming with Rafael to work on "swsuspOnTux"
> > (evolving the in-kernel code while re-using chunks of TuxOnIce code) as
> > a bad Plan B. It has the potential of resulting in a solution clearly
> > superior to all existing ones (TuxOnIce included).
>
> If there are features in swsusp that TuxOnIce is lacking, or features to
> uswsusp that TuxOnIce is lacking, please tell me what they are and I'll
> implement them. In saying this, I don't deny that TuxOnIce code can
> still be improved - there's a lot I'd still like to do.
Instead of new features I would rather see more effort being put into making
the _core_ TuxOnIce (I mean patch #8 here) smaller (8 KLOC is still a lot,
just to put things into the right perspective the current in-kernel content
of kernel/power/ is 5.5 KLOC) and with more documentation inside the code.
Thanks,
Bart
On Sat, 2009-05-09 at 01:05 +0200, Bartlomiej Zolnierkiewicz wrote:
> On Friday 08 May 2009 23:59:31 Nigel Cunningham wrote:
> > Hi.
> >
> > On Fri, 2009-05-08 at 21:44 +0200, Bartlomiej Zolnierkiewicz wrote:
> > > Please proceed to Plan B then.
> > >
> > > Adding third core code framework to do the same thing is out of question
> > > (probably same should have been said about adding second one in the past).
> >
> > Why? We have plenty of history of having multiple implementations of
> > things (slub, slab and slob...).
>
> With all respect to sl*b developers but things such as sl*b etc. are on
> whole different level when it comes to complexity because their interactions
> with weird hardware configurations are quite limited.
No, it's apples with apples. The hibernation code we're talking about
doesn't do "interactions with weird hardware configurations". It does
the job of preparing a hibernation image, compressing and writing that
image to disk, showing the user what's going on and doing the reverse at
resume time.
> > > Moreover you will for sure hit much more regressions than you expect
> > > (I "love" seeing over and over again when people/companies get trapped
> > > into fallacy of superiority of their _own_ solution).
> >
> > That's just wrong. TuxOnIce deliberately doesn't remove any of swsusp or
> > uswsusp so that there's no chance of users having regressions. It
> > provides the _option_ of being a drop in replacement for swsusp, but it
> > doesn't force users to take that option.
>
> OK. What is exactly your plan for transition and for swsusp removal then?
1) Add the TuxOnIce patches given and enable the compile time option for
replacing swsusp by default for a couple of releases.
2) If people find problems, tell them to try using swsusp (echo 0
> /sys/power/tuxonice/replace_swsusp and then try hibernating) and seek
the cause of the regression.
3) Once we're satisfied that there are no regressions or they're fixed,
prepare code to remove swsusp.
> > Regressions happen at the moment because TuxOnIce isn't included in
> > vanilla. Users update from one version of stable to the next or from one
> > version of head to the next and expect TuxOnIce to keep working, and it
> > doesn't always do that because of changes to the vanilla code that
> > require an updated patch.
>
> I mean [u]swsusp -> TuxOnIce regressions.
I know. I think I dealt with that above.
> > > I really wouldn't consider teaming with Rafael to work on "swsuspOnTux"
> > > (evolving the in-kernel code while re-using chunks of TuxOnIce code) as
> > > a bad Plan B. It has the potential of resulting in a solution clearly
> > > superior to all existing ones (TuxOnIce included).
> >
> > If there are features in swsusp that TuxOnIce is lacking, or features to
> > uswsusp that TuxOnIce is lacking, please tell me what they are and I'll
> > implement them. In saying this, I don't deny that TuxOnIce code can
> > still be improved - there's a lot I'd still like to do.
>
> Instead of new features I would rather see more effort being put into making
> the _core_ TuxOnIce (I mean patch #8 here) smaller (8 KLOC is still a lot,
> just to put things into the right perspective the current in-kernel content
> of kernel/power/ is 5.5 KLOC) and with more documentation inside the code.
Yeah, but those 2.5k extra lines get you more reliability and extra
functionality. They're not fat.
Regarding documentation - yes, I could put more in the code. That's an
area I want to improve too. I don't think, however, that it should be
considered a barrier to merging the code into vanilla now.
Nigel
I think you're putting unrealistic barriers in the way. Does all code
that goes into the kernel get "reviewed and agreed upon by everyone at
once"? No! That's why bugs get in that have to be fixed and why things
go in despite the disagreement of some people. If there are good,
technical reasons why TuxOnIce shouldn't be merged, that's valid. But I
haven't seen one technical argument against merging TuxOnIce yet. It's
all just a preference for doing things gradually; a preference that's
unrealistic (there are too many differences) and will lead to it being
another 8 years of development before there's a really good,
user-friendly and feature complete hibernation implementation in the
kernel.
> Second, because realistically you shouldn't expect _anyone_ (be it me or Pavel
> or just about anybody else) to throw away his own code and replace it with
> yours just because *you* think your code is better. You really should have
> listened to the HPA's talk at the OLS last year (here's a link to the paper
> http://ols.fedoraproject.org/OLS/Reprints-2008/anvin-reprint.pdf, please see
> Section 10) which was about merging open source projects, among other things. :-)
<frustration>
This is going to sound arrogant, but please don't take it that way: I
can't see any other way of putting it. I don't *think* my code is
better. It is better. swsusp has essentially stood still since Pavel
first forked the code and got it merged. Yes, you have done some great
work on improving the code too and yes, you've done your work on the
version that was already merged. But your changes have been more in the area
of fixing/improving what's already there than adding new and useful
features. On the other side, I've continued to improve the code, adding
new features (support for multiple swap partitions & files, for writing
to ordinary files, for multithreaded I/O etc etc) making it more useful
and more reliable. There are some new features that have been put in
swsusp, but in just about every case (I think there might be an
exception or two), they're things TuxOnIce had for ages before. eg: SMP
support came with cpu hotplugging in 2.6.12 or so. TuxOnIce had SMP
support in 2.4.
</frustration>
> > > Which unfortunately I don't agree with.
> > >
> > > I think we can get _one_ implementation out of the three, presumably keeping
> > > the user space interface that will keep the current s2disk binaries happy, by
> > > merging TuxOnIce code _gradually_. No "all at once" approach, please.
> > >
> > > And by "merging" I mean _exactly_ that. Not adding new code and throwing
> > > away the old one.
> > >
> > > While I can work on creating one hibernation implementation by taking the
> > > best ideas from all of the implementation we have at hand, I surely won't be
> > > working on replacing our current code with TuxOnIce. If that disappoints you,
> > > then I'm sorry.
> >
> > But who is going to do that, and how and when. You're clearly too busy
> > working on enhancements to the driver model - enhancements that are good
> > and necessary (I'm not at all meaning this is bad). I'm only doing this
> > in a little bit of spare time. Pavel doesn't seem to be doing it at all.
>
> I think I can find some time to work on that. I've spent a lot of time
> recently on improving the allocation of memory for hibernation images and I
> think I can work on the other hibernation-related things either. The most
> important thing to me is whether or not to put that into my todo list. If I
> decide to do it, I'll find the time too.
Okay.
> > And we have different ideas about how things should be done. Userspace
> > vs kernel space. Providing tuning knobs vs not. And so on.
>
> This isn't _that_ important. Actually, I'm not against an entirely in-kernel
> solution, as there are some clear benefits of doing it this way. We only
> need to be careful enough not to break the existing setups.
Agreed.
> > And the code includes some fundamental differences. I freeze processes
> > and prepare the whole image before saving anything or doing an atomic
> > copy whereas you just free memory before doing the atomic copy. You save
> > everything in one part whereas I save the image in two parts.
>
> IMO the differences are not that fundamental. The whole problem boils down
> to using the same data structures for memory management and I think we can
> reach an agreement here.
I think we might be able to agree on using the same data structures, but
I'm not so sure about algorithms - I think you're underestimating the
differences here.
> > I take a modular approach and you have everything hardwired.
>
> That's because using modules wouldn't really make sense for us. :-)
I'm not saying modules but modular. TuxOnIce has support for compression
neatly abstracted into one file, for swap in another and so on.
[u]swsusp doesn't.
Regarding building as modules, though, it does make sense - especially
in the embedded case. Hibernation code isn't needed 99% of the time. Why
have it taking up memory that could be used for other things?
> > Even if we did try to merge the three implementations, there'd come a
> > point where we either threw away core parts of TuxOnIce or dropped the
> > whole of [u]swsusp and started again - or both.
>
> I would be for starting again, really, using the experience we've collected so
> far. How we technically do it is another matter. I personally would prefer
> adding new code in such a way that it's useable from the start, so that it gets
> tested and integrated with all of the subsystems we're touching.
If we have to go down that path, then I'd agree that would make sense.
> > It doesn't disappoint me that you don't want to replace [u]swsusp with
> > TuxOnIce. I never thought you or Pavel would want to do that.
> >
> > I did hold out a little hope that you at least might be supportive
> > anyway of getting TuxOnIce added into vanilla -
>
> That would take lot of work and we'd also have to ask many other busy people
> to do a lot of work for us. I think it's better to just avoid it at this point.
I just don't see why you think that. As I said in another reply, there's
no work for arch maintainers, very little for mm people and nothing for
filesystem maintainers in what I've sent.
> > if only so that users can get a better hibernation experience while we work
> > through merging the three into one. I'd far rather get along well with you
> > than have some sort of competitive relationship.
>
> Great, let's do our best to be productive.
Yes.
Nigel
Well, which algorithms do you have in mind in particular?
Rafael
Actually, yes. Any code that touches a subsystem has to get signed off
by that subsystem's maintainers. Witness any of the long series of
patches that touch all arches, or change out all drivers from one
method to another. Even Andrew Morton, the guy who's the declared 2.6
kernel maintainer, has to split his patches up by subsystem or
lieutenant and push them forward via them and their trees.
You are being treated no differently than anyone else on here, other
than Linus himself who has the power to merge into his tree at a whim,
but even he does so very reluctantly without a signoff from the
affected subsystem people.
TuxOnIce is in a harder position than most patches, as for it to work
it needs to touch so many subsystems.
Is this annoying? I'm sure. But that's why Rafael is offering to do
the annoying part for you, the part that has never worked in the past
when your patches have been posted for comment and hopeful merging:
He's offering to be the social glue between your code and the forms
that are accepted and followed here on LKML. Taking things apart from
the whole, finding the pieces that are non-controversial or easily
argued for, getting them merged upstream with a user, and then moving
on to the rest.
This way, the external TuxOnIce patch set shrinks and shrinks, until
it's eventually gone, with all functionality merged into the kernel in
one form or another.
Is your code better than uswsusp? Almost certainly. This isn't about
that. This is about making your code better than what it is today, by
going through the existing review-and-merge process.
IBM had to do it with their Device Mapper feature set they tried to
drop into the kernel, and the community said "Whoa" the same way
they're reacting to TuxOnIce (Note: Not you, the code.) IBM went off,
wrote things that integrated in with the existing codebase, and got
it merged with the signoffs of the subsystems affected. They're a big
corp, and even they had to play by the existing rules.
Everybody does this here, it's the way things work, because the process *works*.
I personally want to see a better hibernation system in the kernel,
and I personally think it's going to be substantially similar to what
you have today. I also personally have no control over what gets
merged, so hopefully you'll read the above with some care and thought.
Please stick at this.
The case of two 1394 driver stacks isn't fully comparable. I supported
the addition of the second stack with the intent to replace the existing
one after a transition period, and in the hope that the ratio of
available maintainer manpower : size and complexity of the codebase
would improve considerably. Because that was/is the single fundamental
problem in Linux 1394 driver land.
(Besides, how well is infrastructure like the suspend--resume framework
comparable to drivers?)
--
Stefan Richter
-=====-=-=== -=-= -==-=
http://arcgraph.de/sr/
On Sat, May 9, 2009 at 12:59 AM, Nigel Cunningham <ni...@tuxonice.net> wrote:
> Why? We have plenty of history of having multiple implementations of
> things (slub, slab and slob...).
Yes, so please don't make the same mistake we did. Once you have
multiple implementations in the kernel, it's extremely hard to get rid
of them.
> > > > Actually, I see advantages of working together versus fighting flame wars.
> > > > Please stop that, I'm not going to take part in it this time.
> > >
> > > Ok, so what do you propose? Merging tuxonice into 2.6.32, resulting in
> > > having swsusp,uswsusp *and* tuxonice to maintain? I hope not.
> > >
> > > If we are talking about improving mainline to allow tuxonice
> > > functionality... then yes, that sounds reasonable.
> >
> > I'd like to see us have all three for one or two releases of vanilla,
> > just to give time to work out any issues that haven't been foreseen.
> > Once we're all confident that there are no regressions with
> > TuxOnIce, I'd remove swsusp. That's my ideal plan of attack.
>
> So this is an idea to replace our current hibernation implementation with
> TuxOnIce.
>
> Which unfortunately I don't agree with.
>
> I think we can get _one_ implementation out of the three, presumably keeping
> the user space interface that will keep the current s2disk binaries happy, by
> merging TuxOnIce code _gradually_. No "all at once" approach, please.
>
> And by "merging" I mean _exactly_ that. Not adding new code and throwing
> away the old one.
>
> While I can work on creating one hibernation implementation by taking the
> best ideas from all of the implementation we have at hand, I surely won't be
> working on replacing our current code with TuxOnIce. If that disappoints you,
> then I'm sorry.
FWIW, I agree with Rafael here. Improving the current code in
reasonable steps is the way to go.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
> > And we have different ideas about how things should be done. Userspace
> > vs kernel space. Providing tuning knobs vs not. And so on.
>
> This isn't _that_ important. Actually, I'm not against an entirely in-kernel
> solution, as there are some clear benefits of doing it this way. We only
> need to be careful enough not to break the existing setups.
Would you elaborate?
I would really hate to put progressbar painting into kernel; and if
that's in userspace, we can do compression/encryption there too as
well....
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
> > > > > I'd like to see use have all three for one or two releases of vanilla,
> > > > > just to give time to work out any issues that haven't been foreseen.
> > > > > Once we're all that there are confident there are no regressions with
> > > > > TuxOnIce, I'd remove swsusp. That's my ideal plan of attack.
> > > >
> > > > So this is an idea to replace our current hibernation implementation with
> > > > TuxOnIce.
> > >
> > > That's my ideal. I know you and Pavel don't want to see us go down that
> > > path, but I was asked "What do you propose?" and I answered that
> > > question.
> >
> > So you could also easily anticipate my reaction. :-)
> >
> > Quite frankly, I don't really think this is realistic.
> >
> > First, because it is technically too difficult to have all of the code reviewed
> > and _agreed_ _upon_ by everyone at once. And if it's not agreed upon, you'll
> > have to modify it and it won't be the same thing any more once you've done
> > that. Which I'd say is guaranteed, having had a quick look at the code.
>
> I think you're putting unrealistic barriers in the way. Does all code
> that goes into the kernel get "reviewed and agreed upon by everyone at
> once"? No! That's why bugs get in that have to be fixed and why things
> go in despite the disagreement of some people. If there are good,
I don't think Rafael is putting any barriers in your way.
> This is going to sound arrogant, but please don't take it that way: I
> can't see any other way of putting it. I don't *think* my code is
> better. It is better. swsusp has essentially stood still since Pavel
> first forked the code and got it merged. Yes, you have done some
> great
I don't think _I_ forked anything.
> work on improving the code too and yes, you've done your work on the
> version that was already merged. But your changes been more in the area
> of fixing/improving what's already there than adding new and useful
> features. On the other side, I've continued to improve the code,
Yes, that's fair. We kept incremental fixing, improving. On the other
hand, you added new features.
> new features (support for multiple swap partitions & files, for writing
> to ordinary files, for mulithreaded I/O etc etc) making it more useful
> and more reliable. There are some new features that have been put in
> swsusp, but in just about every case (I think there might be an
> exception or two), they're things TuxOnIce had for ages before. eg: SMP
> support came with cpu hotplugging in 2.6.12 or so. TuxOnIce had SMP
> support in 2.4.
You were moving faster because you did not have to move in small
incremental steps, and you were allowed to add temporary hacks into
the code. Is that surprising? Not to me.
[Please update http://www.tuxonice.net/features pages. They are
misleading; yes, uswsusp supports threaded writes, can be reconfigured
without rebooting and yes we did test failure paths, it can be
scripted, and it supports checksums. I don't know what you mean by
kexec support, but kexec/kjump could be used as whole another method
of hibernating a machine, basically adding fourth row to your table.]
> > > I take a modular approach and you have everything hardwired.
> >
> > That's because using modules woudn't really make sense for us. :-)
>
> I'm not saying modules but modular. TuxOnIce has support for compression
> neatly abstracted into one file, for swap in another and so on.
> [u]swsusp doesn't.
uswsusp has compression neatly abstracted into userland. I still
believe that's superior to kernel module.
> > > It doesn't disappoint me that you don't won't to replace [u]swsusp with
> > > TuxOnIce. I never thought you or Pavel would want to do that.
> > >
> > > I did hold out a little hope that you at least might be supportive
> > > anyway of getting TuxOnIce added into vanilla -
> >
> > That would take lot of work and we'd also have to ask many other busy people
> > to do a lot of work for us. I think it's better to just avoid it at this point.
>
> I just don't see why you think that. As I said in another reply, there's
> no work for arch maintainers, very little for mm people and nothing for
> filesystem maintainers in what I've sent.
Mainline [u]swsusp does not have ability to save all the memory,
because that code was deemed too hard to review by mm people. At that
time, that piece of code was nicely separated 300 line diff.
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
One benefit is that we do not need anything in the initrd for hibernation to work.
Another one is that we can get superior performance, for obvious reasons
(less copying of data, faster I/O). Yet another is simpler configuration and
no need to maintain a separate set of user space tools. I probably could
find more.
> I would really hate to put progressbar painting into kernel; and if
> that's in userspace, we can do compression/encryption there too as
> well....
That's correct, we can. But since we have LZO in the kernel now, we can use
it for compression just as well, can't we?
Thanks,
Rafael
Just a quick note to apologise for going quiet - I had to go do 'real
life' yesterday, and am going to be busy for most of the week. I'll get
around to answering all the email eventually!
Nigel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
uswsusp supports compression, see suspend.sf.net. Oh and yes you can
interrupt it (with del key iirc).
Pavel
>>> To summarise disadvantages:
>>>
>>> - only core has 8000 LoC
>>> - it does stuff that can be easily done in userspace
>>> (and that todays distros _do_ in userspace).
>>> - it duplicates uswsusp functionality.
>>> - compared to [u]swsusp, it received little testing
>>>
>>
>> To summarise advantages - for me tuxonice is the only hibernation method that works.
>> (Till now I've had 3 machines - no one of them able to resume with in-kernel swsusp.)
>>
> Just to add my 2 cents as a user of TOI. Every distro and release I've
> tried has one major issue with kernel hibernation. Upon resume when
> hibernating large images, there's a residual footprint in swap. Every
> further hibernation creates a larger footprint, to the order of an
> additional 5-7% each time. Nobody has ever cared in any forum to
> explain why or how I might change that.
Maybe you should explain why it is a 'major issue'?
(Use swapoff -a; swapon -a to 'cure' it).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
Which kernels you tried, what hw it is? Can you do s2ram?
Great advice if you have used memory > physical memory!
Just run two firefox's and thunderbird and you will get that pretty quickly:
# free
total used free shared buffers cached
Mem: 2060560 2040656 19904 0 31452 360580
Swap: 5358296 1475504 3882792
> Pavel
Regarding hardware -
1. AMD Athlon 1700+, VIA KT333 chipset based MB, nVidia GeForce 2
2. AMD X2 Athlon, AMD690V chipset based MB, nVidia 7600GT
3. (current) AMD X4 Phenom II, AMD780G chipset and nVidia 9600GT
(binary driver for video card every time)
Recently I tried kernels 2.6.26, 2.6.27 and now 2.6.29. Kernel 2.6.26 hibernated, but booted
up without recognizing image, so no attempt to resume was made.
Kernels 2.6.27 and 2.6.29 are able to resume, but thats about it - after resuming the box
doesn't respond to keyboard and I have to reset it.
s2ram doesn't work as well. So far my best result is one successful suspend/resume cycle;
second attempt to do the same ended in black screen and the computer became "confused" -
reset (and I mean hardware reset with button on the case) didn't work - I had to turn it off
completely.
I have been using hibernation for quite some time (starting with kernel 2.6.14 or so)
and never managed to get swsusp working. On the other hand tuxonice (and swsusp2
before the renaming) is - except for some glitches - working all that time.
I admit this is completely user's point of view, but I wanted to point out, that for some of us
tuxonice is currently the way to go.
Regards
JB
> >> >To summarise disadvantages:
> >> >
> >> >- only core has 8000 LoC
> >> >- it does stuff that can be easily done in userspace
> >> > (and that todays distros _do_ in userspace).
> >> >- it duplicates uswsusp functionality.
> >> >- compared to [u]swsusp, it received little testing
> >> >
> >>
> >> To summarise advatages - for me tuxonice is the only hibernation method that works.
> >> (Till now I've had 3 machines - no one of them able to resume with in-kernel swsusp.)
> >>
> >
> >Which kernels you tried, what hw it is? Can you do s2ram?
>
> Regarding hardware -
>
> 1. AMD Athlon 1700+, VIA KT333 chipset based MB, nVidia GeForce 2
> 2. AMD X2 Athlon, AMD690V chipset based MB, nVidia 7600GT
> 3. (current) AMD X4 Phenom II, AMD780G chipset and nVidia 9600GT
> (binary driver for video card every time)
Ok, binary drivers may be a problem. Will it work without that? Should
be easy to test with init=/bin/bash.
Pavel
Cool. I'm afraid you have to ask nvidia for help here. (Or maybe you
can just unload nvidia.ko before suspend/reload it after
resume). Anyway problem is with nvidia here..
Yes, that works - and after complete boot w/o loading binary driver resume works too.
And that's where we get to the advantage of TOI again: with TOI I don't have to do that,
suspend works with X.org running. (IMO there's no point in suspending if I have
to start all X apps again)
Anyway, I think this thread is a bit off-topic in -devel list, so if there isn't anything
important to add, this will be the last post from me.
Regards
JB
> > > > And we have different ideas about how things should be done. Userspace
> > > > vs kernel space. Providing tuning knobs vs not. And so on.
> > >
> > > This isn't _that_ important. Actually, I'm not against an entirely in-kernel
> > > solution, as there are some clear benefits of doing it this way. We only
> > > need to be careful enough not to break the existing setups.
> >
> > Would you elaborate?
>
> One benefit is that we need not anything in the initrd for hibernation to work.
> Another one is that we can get superior performance, for obvious reasons
> (less copying of data, faster I/O). Yet another is simpler configuration and
> no need to maintain a separate set of user space tools. I probably could
> find more.
>
> > I would really hate to put progressbar painting into kernel; and if
> > that's in userspace, we can do compression/encryption there too as
> > well....
>
> That's correct, we can. But since we have LZO in the kernel now, we can use
> it for compression just as well, can't we?
Yes, but we do not have progressbar painting in the kernel -- yet --
so users will still need initrd etc.
Yes, we can move LZO into kernel pretty cheaply, and it will have
minor benefit of slightly faster regular swsusp, but...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
As a user I support this.
Why?
Cause everytime I tried TuxOnIce just worked while I had various problems
whenever I tried out any in-kernel snapshot stuff, be it hard-wired or
userspace supported. Actually I never could have been bothered to try out
to fix any issues with these while I just had a working solution and
thats TuxOnIce. Might have been that issues could have been fixed - but
exactly why should I care when I have something that just works? Honestly
I can't even be bothered to remember those issues in detail. It was
crashes, hangs and on the last occurrence of testing mostly slowness while
it basically worked mostly. Release versions of TuxOnIce didn't fail for
me as long as I can remember.
My case for TuxOnIce?
shambhala:~> cat /sys/power/tuxonice/debug_info
TuxOnIce debugging info:
- TuxOnIce core : 3.0.1
- Kernel Version : 2.6.29.2-tp42-toi-3.0.1
- Compiler vers. : 4.3
- Attempt number : 39
- Parameters : 0 667656 0 1 700 5
- Overall expected compression percentage: 30.
- Checksum method is 'md4'.
0 pages resaved in atomic copy.
- Compressor is 'lzo'.
Compressed 528949248 bytes into 223456685 (57 percent compression).
- Max outstanding reads 570. Max writes 132.
Memory_needed: 1024 x (4096 + 216 + 72) = 4489216 bytes.
Free mem throttle point reached 0.
- SwapAllocator active.
Swap available for image: 661830 pages.
- FileAllocator inactive.
- I/O speed: Write 41 MB/s, Read 35 MB/s.
- Extra pages : 18 used/500.
- Result : Succeeded.
39 attempts and counting.
I have seen uptimes of up to almost 70 days and this one has only been
interrupted by user error - shutting down instead of triggering snapshot.
shambhala:~> uprecords | head -12 | cut -b1-56
# Uptime | System
----------------------------+---------------------------
1 31 days, 01:07:24 | Linux 2.6.26.5-tp42-toi-
-> 2 17 days, 21:47:04 | Linux 2.6.29.2-tp42-toi-
3 17 days, 12:38:36 | Linux 2.6.28.8-tp42-toi-
4 15 days, 14:39:26 | Linux 2.6.28.4-tp42-toi-
5 15 days, 13:58:12 | Linux 2.6.27.7-tp42-toi-
6 12 days, 21:54:18 | Linux 2.6.26.5-tp42-toi-
7 10 days, 22:02:14 | Linux 2.6.28.7-tp42-toi-
8 10 days, 08:04:52 | Linux 2.6.26.2-tp42-toi-
9 8 days, 00:34:34 | Linux 2.6.28.7-tp42-toi-
10 7 days, 12:56:54 | Linux 2.6.28.8-tp42-toi-
(uprecords cuts kernel version, so concrete TuxOnIce versions are missing.
Including low uptimes with 2.6.28 due to hard crashes during preparing
hibernation when the switch from X11 to console frame buffer was about to
come - which luckily appear to have gone with 2.6.29. And from time to
time I can be bothered to upgrade the kernel as well.)
And the speed - which even got higher after the switch from LZF to
in-kernel LZO:
shambhala:~> zgrep "I/O speed" /var/log/syslog | sed 's/localhost //' |
tail -10
May 10 18:15:09 kernel: - I/O speed: Write 49 MB/s, Read 48 MB/s.
May 11 09:37:31 kernel: - I/O speed: Write 49 MB/s, Read 45 MB/s.
May 12 08:22:55 kernel: - I/O speed: Write 46 MB/s, Read 45 MB/s.
May 12 19:40:25 kernel: - I/O speed: Write 45 MB/s, Read 42 MB/s.
May 13 09:03:34 kernel: - I/O speed: Write 44 MB/s, Read 35 MB/s.
May 13 19:21:31 kernel: - I/O speed: Write 50 MB/s, Read 39 MB/s.
May 14 08:51:45 kernel: - I/O speed: Write 46 MB/s, Read 38 MB/s.
May 15 08:52:53 kernel: - I/O speed: Write 46 MB/s, Read 40 MB/s.
May 16 12:56:10 kernel: - I/O speed: Write 53 MB/s, Read 54 MB/s.
May 16 19:08:55 kernel: - I/O speed: Write 41 MB/s, Read 35 MB/s.
That's on a ThinkPad T42 with a 160 GB Hitachi IDE drive.
My conclusion: TuxOnIce is awesome.
I do not add anything more to the general discussion. I am fine with
TuxOnIce replacing in-kernel snapshot implementation. I am also fine with
TuxOnIce serving as inspiration to make in-kernel snapshot better. But I,
for my part, can't be bothered to waste my precious time to try out
in-kernel stuff again unless I can be convinced that it might have
competitive reliability, speed and feature set. In the role of a user I
can be very lazy. TuxOnIce in kernel would convince me, that's for sure.
I am pretty puzzled that apparently apart from Sam Ravnborg no kernel
developer made any public review of concrete patches.
I think the effort of Nigel deserves a bit more than general comments and
the usual userspace versus in kernel discussion. At least some hints for
Nigel for what he should change in general in order to improve the
likelihood of a review. At the moment it appears to me that it has been
rejected without even looking at it.
So what are the *concrete* issues with the patchset Nigel posted?
I respect that kernel developers have the right to be lazy as well ;). And
I am free to compile my own kernels for as long as I see fit with
TuxOnIce being probably my only reason to do so.
But I ask for at least some concrete feedback on the concrete patchset
Nigel posted, instructions for Nigel on what to change / do differently -
apart from that general and repetitive discussion. Does the patchset need
to be smaller? Should patches be split or joined? Should comments be
improved? Should the patchset be structured differently?
And if replacing the in-kernel snapshot implementation with TuxOnIce is
completely out of the question - even without looking at the concrete
patchset - I ask for concrete hints on alternative approaches Nigel could
follow.
--
Martin 'Helios' Steigerwald - http://www.Lichtvoll.de
GPG: 03B0 0D6C 0040 0710 4AFA B82F 991B EAAC A599 84C7
Just a very small comment below.
2009/5/6 Nigel Cunningham <ni...@tuxonice.net>:
> This patch creates new Documentation/power files for describing
> TuxOnIce, and a MAINTAINERS entry for the code.
>
> Signed-off-by: Nigel Cunningham <ni...@tuxonice.net>
> ---
>  Documentation/power/tuxonice-internals.txt |  477 ++++++++++++++
>  Documentation/power/tuxonice.txt           |  924 ++++++++++++++++++++++++++++
>  MAINTAINERS                                |    7 +
>  3 files changed, 1408 insertions(+), 0 deletions(-)
>  create mode 100644 Documentation/power/tuxonice-internals.txt
>  create mode 100644 Documentation/power/tuxonice.txt
[...]
> +   Most people will only want to hibernate to a local file. To achieve that, do
> +   something along the lines of:
> +
> +   echo "TuxOnIce" > /hibernation-file
> +   dd if=/dev/zero bs=1M count=512 >> hibernation-file
It seems that a / is missing here?
> +
> +   This will create a 512MB file called /hibernation-file. To get TuxOnIce to use
> +   it:
> +
> +   echo /hibernation-file > /sys/power/tuxonice/file/target
Vegard
On Sat, 2009-05-16 at 22:10 +0200, Vegard Nossum wrote:
> Hi,
>
> Just a very small comment below.
Thanks very much!
Nigel
(Starting to catch up after a week away)
On Thu, 2009-05-14 at 11:16 +0200, Pavel Machek wrote:
> Hi!
>
> > > > > And we have different ideas about how things should be done. Userspace
> > > > > vs kernel space. Providing tuning knobs vs not. And so on.
> > > >
> > > > This isn't _that_ important. Actually, I'm not against an entirely in-kernel
> > > > solution, as there are some clear benefits of doing it this way. We only
> > > > need to be careful enough not to break the existing setups.
> > >
> > > Would you elaborate?
> >
> > One benefit is that we need not anything in the initrd for hibernation to work.
> > Another one is that we can get superior performance, for obvious reasons
> > (less copying of data, faster I/O). Yet another is simpler configuration and
> > no need to maintain a separate set of user space tools. I probably could
> > find more.
> >
> > > I would really hate to put progressbar painting into kernel; and if
> > > that's in userspace, we can do compression/encryption there too as
> > > well....
> >
> > That's correct, we can. But since we have LZO in the kernel now, we can use
> > it for compression just as well, can't we?
>
> Yes, but we do not have progressbar painting in the kernel -- yet --
> so users will still need initrd etc.
Who does?
> Yes, we can move LZO into kernel pretty cheaply, and it will have
> minor benefit of slightly faster regular swsusp, but...
LZO is already in the kernel (a cryptoapi module). The result won't be
slightly faster - it will be (assuming the CPU is fast enough) slightly
better than double the speed, on average.
Regards,
Nigel
one of the real frustrations i've had watching this process from the
sidelines is that those with the authority to make decisions have
never taken either of these very important concerns seriously. And
until they do, I do think it's quite likely that suspend-to-disk will
continue in its largely-broken state for quite some time to come.
matt
On Sat, 2009-05-16 at 22:53 -0400, Matt Price wrote:
> one of the real frustrations i've had watching this process from the
> sidelines is that those with the authority to make decisions have
> never taken either of these very important concerns seriously. And
> until they do, I do think it's quite likely that suspend-to-disk will
> continue in its largely-broken state for quite some time to come.
I just want to talk a little in defence of Rafael - I've found him
really good to deal with. Yes, he has been headed in a different
direction, but he's not unreasonable and he is responsive to bug reports
and suggestions. Please don't think less of him than you ought.
Regards,
Nigel
didn't really mean to personalize my comments, which on rereading it
appears i may have done. all i meant to say was that these other
concerns are real issues that make a real difference to the usability
of the code. and i think it's a shame they aren't (perhaps can't be)
taken into account in these code reviews.
though, echoing martin again, at least in these public threads i've
seen very little specific review of the patches themselves. i only
hope this is happening on other channels.
matt
Might have been nicer if your RFC message actually quickly described
what TuxOnIce actually is :-)
Cheers,
Ben.
On Sat, 2009-05-09 at 15:58 +0200, Pavel Machek wrote:
> Hi!
>
> > > Instead of new features I would rather see more effort being put into making
> > > the _core_ TuxOnIce (I mean patch #8 here) smaller (8 KLOC is still a lot,
> > > just to put things into the right perspective the current in-kernel content
> > > of kernel/power/ is 5.5 KLOC) and with more documentation inside the code.
> >
> > Yeah, but those 2.5k extra lines get you more reliability and extra
> > functionality. They're not fat.
>
> If you know about reliability problems in swsusp, please fix them in
> separate patch. Hiding the fixes in 8KLOC patch is not nice.
I'm going to try to. Unfortunately, they'll require what's basically a
ground-up redesign of the basic algorithm, because to get maximum
reliability, you need to carefully account for the amount of storage
you're going to need and the amount of memory you have available, and
'prepare' the image prior to doing the atomic copy.
Regards,
Nigel
On Sat, 2009-05-09 at 15:54 +0200, Pavel Machek wrote:
> > This is going to sound arrogant, but please don't take it that way: I
> > can't see any other way of putting it. I don't *think* my code is
> > better. It is better. swsusp has essentially stood still since Pavel
> > first forked the code and got it merged. Yes, you have done some
> > great
>
> I don't think _I_ forked anything.
The conversation for May 2002 (around when you got it merged into
vanilla) is here:
Not sure why Sourceforge wants you to log in to get at it.
> > work on improving the code too and yes, you've done your work on the
> > version that was already merged. But your changes have been more in the area
> > of fixing/improving what's already there than adding new and useful
> > features. On the other side, I've continued to improve the code,
>
> Yes, that's fair. We kept incremental fixing, improving. On the other
> hand, you added new features.
>
> > new features (support for multiple swap partitions & files, for writing
> > to ordinary files, for multithreaded I/O etc etc) making it more useful
> > and more reliable. There are some new features that have been put in
> > swsusp, but in just about every case (I think there might be an
> > exception or two), they're things TuxOnIce had for ages before. eg: SMP
> > support came with cpu hotplugging in 2.6.12 or so. TuxOnIce had SMP
> > support in 2.4.
>
> You were moving faster because you did not have to move in small
> incremental steps, and you were allowed to add temporary hacks into
> the code. Is that surprising? Not to me.
No, I moved in small incremental steps too - and the odd big rework.
> [Please update http://www.tuxonice.net/features pages. They are
> misleading; yes, uswsusp supports threaded writes, can be reconfigured
> without rebooting and yes we did test failure paths, it can be
> scripted, and it supports checksums. I don't know what you mean by
> kexec support, but kexec/kjump could be used as whole another method
> of hibernating a machine, basically adding fourth row to your table.]
uswsusp supports multithreaded I/O? Wow. When did that happen?
Okay. How do you reconfigure it without rebooting (I mean tell it to
write the image to a different location)? (So I can put the instructions
on the page).
Regarding kexec, I'm thinking about making TuxOnIce able to do a kexec
jump and then continue with writing the image (after preparing it in the
original kernel).
(Note to self for later - look above for other things Pavel says uswsusp
can do when updating the page).
> > > > I take a modular approach and you have everything hardwired.
> > >
> > > That's because using modules wouldn't really make sense for us. :-)
> >
> > I'm not saying modules but modular. TuxOnIce has support for compression
> > neatly abstracted into one file, for swap in another and so on.
> > [u]swsusp doesn't.
>
> uswsusp has compression neatly abstracted into userland. I still
> believe that's superior to kernel module.
Okay, but what about swap support? Modifying swsusp or uswsusp to write
to ordinary files would require a huge change in multiple places - where
you store the image isn't currently abstracted at all from the issue of
what you're storing, and I dare say a person with a slow computer who
gets no advantage out of compression will have to recompile uswsusp to
turn it off (if that's allowed for).
> > > > It doesn't disappoint me that you don't want to replace [u]swsusp with
> > > > TuxOnIce. I never thought you or Pavel would want to do that.
> > > >
> > > > I did hold out a little hope that you at least might be supportive
> > > > anyway of getting TuxOnIce added into vanilla -
> > >
> > > That would take lot of work and we'd also have to ask many other busy people
> > > to do a lot of work for us. I think it's better to just avoid it at this point.
> >
> > I just don't see why you think that. As I said in another reply, there's
> > no work for arch maintainers, very little for mm people and nothing for
> > filesystem maintainers in what I've sent.
>
> Mainline [u]swsusp does not have ability to save all the memory,
> because that code was deemed too hard to review by mm people. At that
> time, that piece of code was nicely separated 300 line diff.
Okay. That will be a nicely separated diff later too (assuming I get
that far), but the groundwork will be laid well before that little diff
goes in.
Regards,
Nigel
On Sat, 2009-05-09 at 01:43 +0200, Rafael J. Wysocki wrote:
> > On Sat, 2009-05-09 at 00:46 +0200, Rafael J. Wysocki wrote:
> > > On Friday 08 May 2009, Nigel Cunningham wrote:
> > > > On Fri, 2009-05-08 at 16:11 +0200, Rafael J. Wysocki wrote:
> > > > > On Friday 08 May 2009, Nigel Cunningham wrote:
> > > > And the code includes some fundamental differences. I freeze processes
> > > > and prepare the whole image before saving anything or doing an atomic
> > > > copy whereas you just free memory before doing the atomic copy. You save
> > > > everything in one part whereas I save the image in two parts.
> > >
> > > IMO the differences are not that fundamental. The whole problem boils down
> > > to using the same data structures for memory management and I think we can
> > > reach an agreement here.
> >
> > I think we might be able to agree on using the same data structures, but
> > I'm not so sure about algorithms - I think you're underestimating the
> > differences here.
>
> Well, which algorithms do you have in mind in particular?
Sorry for the slow reply - just starting to catch up after time away.
The main difference is the order of doing things. TuxOnIce prepares the
image after freezing processes and before the atomic copy. It doesn't
just do that so that it can store a complete image of memory. It also
does it because once processes are frozen, the only thing that's going
to allocate storage is TuxOnIce, and the only things that are going to
allocate RAM are TuxOnIce and the drivers' suspend routines. The
drivers' routines are pretty consistent - once you've seen how much is
used for one invocation, you can add a small margin and call that the
allowance to use for all future invocations. The amount of memory used
by the hibernation code is also entirely predictable - once you know the
characteristics of the system as it stands (ie with processes frozen),
you know how much you're going to need for the atomic copy and for doing
I/O. If you find that something is too big, all you need to do is thaw
kernel threads and free some memory until you fit within constraints or
(heaven forbid!) find that you're not getting anywhere and so want to give
up on hibernating altogether.
If, on the other hand, you do the drivers suspend etc and then look to
see what state you're in, well you might need to thaw drivers etc in
order to free memory before trying again. It's more expensive. Right now
you're just giving up in that case - yes, you could retry too instead of
giving up completely, but it's better IMHO to seek to get things right
before suspending drivers.
Oh, before I forget to mention and you ask - how to know what allowance
for the drivers? I use a sysfs entry - the user then just needs to see
what's needed on their first attempt, set up a means of putting that
value in the sysfs file in future (eg /etc/hibernate/tuxonice.conf) and
then forget about it.
I don't quite get it; why is that needed?
If there's not enough swap available, swsusp should freeze, realize
there's no swap, unfreeze and continue. I do not see reliability
problem there.
If there are some problems like "it crashes during suspend or resume"
or "suspend succeeds but fails to resume", we'd like to fix those with
priority...
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
I do believe the needed amount of memory can differ between
different suspend attempts. I'd say the difference between no X, X w/ 3D,
and X w/ active 3D could be significant.
Plus... swsusp gets by without configuring that, and I'd prefer as few
tunables as possible. (In swsusp case, 4MB is reserved for drivers,
and it is a bug if they need more).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Mon, 2009-05-25 at 14:43 +0200, Pavel Machek wrote:
> > Oh, before I forget to mention and you ask - how to know what allowance
> > for the drivers? I use a sysfs entry - the user then just needs to see
> > what's needed on their first attempt, set up a means of putting that
> > value in the sysfs file in future (eg /etc/hibernate/tuxonice.conf) and
> > then forget about it.
>
> I do believe needed amount of memory can differ between
> different suspend attempts. I'd say difference between no X, X w/ 3D,
> X w/ active 3D could be significant.
Yes, but people don't generally change their X configuration. There
would certainly be less memory required without X, but it doesn't hurt
to allow for too many pages being allocated.
> Plus... swsusp gets without configuring that, and I'd prefer as little
> tunables as possible. (In swsusp case, 4MB is reserved for drivers,
> and it is a bug if they need more).
Perhaps this is one of the causes of swsusp (and uswsusp) not working
for people. 4MB is often enough, but often not enough. Actually,
thinking some more about it, it's probably not a problem for swsusp and
uswsusp simply because you have large amounts of memory free anyway. I
agree that they probably shouldn't need more, but wouldn't call it a bug
if more is requested - it's an arbitrary limit.
Regards,
Nigel
The software suspend may be a part of your response to an imminent
power failure (UPS near empty). The number of retries available is possibly
limited.
I'd feel safer if hibernation by default wrote to a dedicated partition,
especially as modern practice is to make swap space smaller than RAM.
Regards
Oliver
If there's no swap (and no hibernation partition), s2disk just will
not work.
> I'd feel safer if hibernation by default wrote to a dedicated partition,
> especially as modern practice is to make swap space smaller than RAM.
It would be easy to have dedicated partition. But why waste space on
it?
Anyway, this debate here is "in what order should we do the swsusp
actions". Dedicated partition/etc is for separate thread (please).
Pavel
--
(english) http://www.livejournal.com/~pavelmachek
(cesky, pictures) http://atrey.karlin.mff.cuni.cz/~pavel/picture/horses/blog.html
On Mon, 2009-05-25 at 14:32 +0200, Pavel Machek wrote:
> On Mon 2009-05-25 19:27:46, Nigel Cunningham wrote:
> > Hi.
> >
> > On Sat, 2009-05-09 at 15:58 +0200, Pavel Machek wrote:
> > > Hi!
> > >
> > > > > Instead of new features I would rather see more effort being put into making
> > > > > the _core_ TuxOnIce (I mean patch #8 here) smaller (8 KLOC is still a lot,
> > > > > just to put things into the right perspective the current in-kernel content
> > > > > of kernel/power/ is 5.5 KLOC) and with more documentation inside the code.
> > > >
> > > > Yeah, but those 2.5k extra lines get you more reliability and extra
> > > > functionality. They're not fat.
> > >
> > > If you know about reliability problems in swsusp, please fix them in
> > > separate patch. Hiding the fixes in 8KLOC patch is not nice.
> >
> > I'm going to try to. Unfortunately, they'll require what's basically a
> > ground-up redesign of the basic algorithm, because to get maximum
> > reliability, you need to carefully account for the amount of storage
> > you're going to need and the amount of memory you have available, and
> > 'prepare' the image prior to doing the atomic copy.
>
> I don't quite get it; why is that needed?
>
> If there's not enough swap available, swsusp should freeze, realize
> there's no swap, unfreeze and continue. I do not see reliability
> problem there.
If there's not enough storage available (I'm also thinking of the file
allocator Oliver wants), freeing some memory may get you in a position
where you can hibernate. It makes sense to try to calculate how much
memory you need to free, thaw kernel threads (but not userspace), seek
to free that memory and try again - especially once we get a
shrink_all_memory replacement/rework that actually gives you what you
ask for if that's possible (that was the point to the extra code I used
to have in vmscan.c).
Regards,
Nigel
Hi,
> On Sat, 2009-05-09 at 01:43 +0200, Rafael J. Wysocki wrote:
> > > On Sat, 2009-05-09 at 00:46 +0200, Rafael J. Wysocki wrote:
> > > > On Friday 08 May 2009, Nigel Cunningham wrote:
> > > > > On Fri, 2009-05-08 at 16:11 +0200, Rafael J. Wysocki wrote:
> > > > > > On Friday 08 May 2009, Nigel Cunningham wrote:
> > > > > And the code includes some fundamental differences. I freeze processes
> > > > > and prepare the whole image before saving anything or doing an atomic
> > > > > copy whereas you just free memory before doing the atomic copy. You save
> > > > > everything in one part whereas I save the image in two parts.
> > > >
> > > > IMO the differences are not that fundamental. The whole problem boils down
> > > > to using the same data structures for memory management and I think we can
> > > > reach an agreement here.
> > >
> > > I think we might be able to agree on using the same data structures, but
> > > I'm not so sure about algorithms - I think you're underestimating the
> > > differences here.
> >
> > Well, which algorithms do you have in mind in particular?
>
> Sorry for the slow reply - just starting to catch up after time away.
NP
> The main difference is the order of doing things. TuxOnIce prepares the
> image after freezing processes and before the atomic copy. It doesn't
> just do that so that it can store a complete image of memory. It also
> does it because once processes are frozen, the only thing that's going
> to allocate storage is TuxOnIce,
This is quite a strong statement. Is it provable?
> and the only things that are going to allocate RAM are TuxOnIce and the
> drivers' suspend routines.
Hmm. What about kernel threads that are not frozen?
> The drivers' routines are pretty consistent - once you've seen how much is
> used for one invocation, you can add a small margin and call that the
> allowance to use for all future invocations. The amount of memory used
> by the hibernation code is also entirely predictable - once you know the
> characteristics of the system as it stands (ie with processes frozen),
> you know how much you're going to need for the atomic copy and for doing
> I/O. If you find that something is too big, all you need to do is thaw
> kernel threads and free some memory until you fit within constraints or
> (heaven forbid!) find that you're not getting anywhere and so want to give
> up on hibernating altogether.
>
> If, on the other hand, you do the drivers suspend etc and then look to
> see what state you're in, well you might need to thaw drivers etc in
> order to free memory before trying again. It's more expensive. Right now
> you're just giving up in that case - yes, you could retry too instead of
> giving up completely, but it's better IMHO to seek to get things right
> before suspending drivers.
>
> Oh, before I forget to mention and you ask - how to know what allowance
> for the drivers? I use a sysfs entry - the user then just needs to see
> what's needed on their first attempt, set up a means of putting that
> value in the sysfs file in future (eg /etc/hibernate/tuxonice.conf) and
> then forget about it.
OK, this is reasonable.
Still, I think your approach is based on some assumptions that need to be
verified, so that either we are 100% sure they are satisfied, or we have some
safeguards in place in case they aren't.
Best,
Rafael