/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Copyright 2019 Google LLC
 */
#ifndef _INCFS_DATA_MGMT_H
#define _INCFS_DATA_MGMT_H

#include <linux/cred.h>
#include <linux/fs.h>
#include <linux/types.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/rcupdate.h>
#include <linux/completion.h>
#include <linux/wait.h>
#include <linux/zstd.h>
#include <crypto/hash.h>
#include <linux/rwsem.h>

#include <uapi/linux/incrementalfs.h>

#include "internal.h"
#include "pseudo_files.h"

/* Number of entries in data_file.df_segments. */
#define SEGMENTS_PER_FILE 3

/*
 * Tag stored in the leading 3-bit `type` bit-field of every read log record
 * layout declared below. Each value corresponds to the record struct of the
 * same (lower-case) name; FULL corresponds to struct full_record.
 */
enum LOG_RECORD_TYPE {
	FULL,
	SAME_FILE,
	SAME_FILE_CLOSE_BLOCK,
	SAME_FILE_CLOSE_BLOCK_SHORT,
	SAME_FILE_NEXT_BLOCK,
	SAME_FILE_NEXT_BLOCK_SHORT,
};

/*
 * Self-contained log record: carries the block index, file id, an absolute
 * timestamp in microseconds and the uid. read_log_state.base_record keeps
 * one of these as the record to diff against.
 *
 * The layout is packed; do not reorder or resize members.
 */
struct full_record {
	enum LOG_RECORD_TYPE type : 3; /* FULL */
	u32 block_index : 29;
	incfs_uuid_t file_id;
	u64 absolute_ts_us;
	uid_t uid;
} __packed; /* 32 bytes */

/*
 * Compact log record that omits the file id but still carries a complete
 * block index and uid, plus a 16-bit relative timestamp.
 *
 * NOTE(review): judging by the name, the file is taken from the preceding
 * full record and the timestamp is relative to the previous record - confirm
 * against the encoder.
 */
struct same_file {
	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE */
	u32 block_index : 29;
	uid_t uid;
	u16 relative_ts_us; /* max 2^16 us ~= 64 ms */
} __packed; /* 10 bytes */

/*
 * Compact log record that stores the block index as a signed 16-bit delta
 * and the timestamp as a 13-bit relative value in microseconds.
 *
 * NOTE(review): the reference points for the delta and the relative
 * timestamp are not visible in this header - confirm against the encoder.
 */
struct same_file_close_block {
	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK */
	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
	s16 block_index_delta;
} __packed; /* 4 bytes */

/*
 * Shorter form of same_file_close_block: a signed 8-bit block index delta
 * and a 5-bit relative timestamp counted in units of 10 us.
 */
struct same_file_close_block_short {
	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_CLOSE_BLOCK_SHORT */
	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
	s8 block_index_delta;
} __packed; /* 2 bytes */

/*
 * Compact log record that carries no block index at all, only a 13-bit
 * relative timestamp in microseconds.
 *
 * NOTE(review): the name suggests the block is the one following the
 * previously logged block - confirm against the encoder.
 */
struct same_file_next_block {
	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK */
	u16 relative_ts_us : 13; /* max 2^13 us ~= 8 ms */
} __packed; /* 2 bytes */

/*
 * Shortest log record: the type tag plus a 5-bit relative timestamp counted
 * in units of 10 us, with no block index.
 */
struct same_file_next_block_short {
	enum LOG_RECORD_TYPE type : 3; /* SAME_FILE_NEXT_BLOCK_SHORT */
	u8 relative_ts_tens_us : 5; /* max 2^5*10 us ~= 320 us */
} __packed; /* 1 byte */

/*
 * Overlay of every log record layout. Each member begins with the 3-bit
 * `type` bit-field, which identifies the member that is actually stored.
 */
union log_record {
	struct full_record full_record;
	struct same_file same_file;
	struct same_file_close_block same_file_close_block;
	struct same_file_close_block_short same_file_close_block_short;
	struct same_file_next_block same_file_next_block;
	struct same_file_next_block_short same_file_next_block_short;
};

/*
 * A position within the read log ring buffer. struct read_log keeps two of
 * these (rl_head and rl_tail); one is also returned by incfs_get_log_state()
 * and passed to incfs_collect_logged_reads().
 */
struct read_log_state {
	/* Log buffer generation id, incremented on configuration changes */
	u32 generation_id;

	/* Offset in rl_ring_buf to write into. */
	u32 next_offset;

	/* Current number of writer passes over rl_ring_buf */
	u32 current_pass_no;

	/* Current full_record to diff against */
	struct full_record base_record;

	/* Current record number counting from configuration change */
	u64 current_record_no;
};

/*
 * A ring buffer to save records about data blocks which were recently read.
 *
 * NOTE(review): the last two members use an ml_ prefix while the rest use
 * rl_; the names are kept as they are because they are referenced elsewhere.
 */
struct read_log {
	/* Storage for the log records */
	void *rl_ring_buf;

	/* Size of rl_ring_buf (units are not visible here - TODO confirm) */
	int rl_size;

	/*
	 * Positions at the two ends of the ring. Which one the writer advances
	 * is not visible from this header.
	 */
	struct read_log_state rl_head;

	struct read_log_state rl_tail;

	/* A lock to protect the above fields */
	spinlock_t rl_lock;

	/* A queue of waiters who want to be notified about reads */
	wait_queue_head_t ml_notif_wq;

	/* A work item to wake up those waiters without slowing down readers */
	struct delayed_work ml_wakeup_work;
};

/*
 * Per-mount tunables. Stored in mount_info.mi_options and passed to
 * incfs_alloc_mount_info() and incfs_realloc_mount_info().
 *
 * The per-field notes below are derived from the member names only -
 * TODO confirm against the option parser.
 */
struct mount_options {
	/* Read timeout, in milliseconds */
	unsigned int read_timeout_ms;
	/* Readahead amount, in pages */
	unsigned int readahead_pages;
	/* Read log size, in pages */
	unsigned int read_log_pages;
	/* Presumably the number of logged reads that triggers a waiter wakeup */
	unsigned int read_log_wakeup_count;
	/* Presumably whether uids are reported to user space */
	bool report_uid;
	/* Name used for the sysfs node; ownership not visible here */
	char *sysfs_name;
};

/*
 * State of one incfs mount: backing directory, pending read tracking, read
 * log, per-uid timeouts, zstd decompression workspace, last error details
 * and read statistics. Created by incfs_alloc_mount_info() and released by
 * incfs_free_mount_info().
 */
struct mount_info {
	struct super_block *mi_sb;

	struct path mi_backing_dir_path;

	struct dentry *mi_index_dir;
	/*
	 * For stacking mounts: true if the index dir must be freed for this
	 * SB, false if it was created by a lower level SB.
	 */
	bool mi_index_free;

	struct dentry *mi_incomplete_dir;
	/*
	 * For stacking mounts: true if the incomplete dir must be freed for
	 * this SB. Same meaning as mi_index_free.
	 */
	bool mi_incomplete_free;

	const struct cred *mi_owner;

	struct mount_options mi_options;

	/* This mutex is to be taken before create, rename, delete */
	struct mutex mi_dir_struct_mutex;

	/*
	 * A queue of waiters who want to be notified about new pending reads.
	 */
	wait_queue_head_t mi_pending_reads_notif_wq;

	/*
	 * Protects - RCU safe:
	 *  - mi_reads_list_head
	 *  - mi_pending_reads_count
	 *  - mi_last_pending_read_number
	 *  - data_file_segment.reads_list_head
	 */
	spinlock_t pending_read_lock;

	/* List of active pending_read objects */
	struct list_head mi_reads_list_head;

	/* Total number of items in mi_reads_list_head */
	int mi_pending_reads_count;

	/*
	 * Last serial number that was assigned to a pending read.
	 * 0 means no pending reads have been seen yet.
	 */
	int mi_last_pending_read_number;

	/* Temporary buffer for read logger. */
	struct read_log mi_log;

	/* SELinux needs special xattrs on our pseudo files */
	struct mem_range pseudo_file_xattr[PSEUDO_FILE_COUNT];

	/* A queue of waiters who want to be notified about blocks_written */
	wait_queue_head_t mi_blocks_written_notif_wq;

	/* Number of blocks written since mount */
	atomic_t mi_blocks_written;

	/*
	 * Per UID read timeouts.
	 * NOTE(review): presumably the lock guards the array pointer and its
	 * size, and the size is an element count - confirm.
	 */
	spinlock_t mi_per_uid_read_timeouts_lock;
	struct incfs_per_uid_read_timeouts *mi_per_uid_read_timeouts;
	int mi_per_uid_read_timeouts_size;

	/*
	 * zstd workspace.
	 * NOTE(review): presumably the mutex guards the workspace and stream,
	 * and the delayed work releases them - confirm.
	 */
	struct mutex mi_zstd_workspace_mutex;
	void *mi_zstd_workspace;
	ZSTD_DStream *mi_zstd_stream;
	struct delayed_work mi_zstd_cleanup_work;

	/* sysfs node */
	struct incfs_sysfs_node *mi_sysfs_node;

	/*
	 * Last error information.
	 * NOTE(review): presumably mi_le_mutex guards the mi_le_* fields -
	 * confirm.
	 */
	struct mutex	mi_le_mutex;
	incfs_uuid_t	mi_le_file_id;
	u64		mi_le_time_us;
	u32		mi_le_page;
	u32		mi_le_errno;
	uid_t		mi_le_uid;

	/* Number of reads timed out */
	u32 mi_reads_failed_timed_out;

	/* Number of reads failed because hash verification failed */
	u32 mi_reads_failed_hash_verification;

	/* Number of reads failed for another reason */
	u32 mi_reads_failed_other;

	/* Number of reads delayed because page had to be fetched */
	u32 mi_reads_delayed_pending;

	/* Total time waiting for pages to be fetched */
	u64 mi_reads_delayed_pending_us;

	/*
	 * Number of reads delayed because of per-uid min_time_us or
	 * min_pending_time_us settings
	 */
	u32 mi_reads_delayed_min;

	/*
	 * Total time waiting because of per-uid min_time_us or
	 * min_pending_time_us settings.
	 *
	 * Note that if a read is initially delayed because we have to wait for
	 * the page, then further delayed because of min_pending_time_us
	 * setting, this counter gets incremented by only the further delay
	 * time.
	 */
	u64 mi_reads_delayed_min_us;
};

/*
 * Description of one stored block.
 * NOTE(review): field meanings below are read off the member names -
 * confirm against the block map reader.
 */
struct data_file_block {
	/* Offset of the block's data in the backing file */
	loff_t db_backing_file_data_offset;

	/* Size of the block as stored */
	size_t db_stored_size;

	/* Compression algorithm applied to the stored block */
	enum incfs_compression_alg db_comp_alg;
};

/*
 * One pending read. Active objects are kept on
 * mount_info.mi_reads_list_head and on the reads_list_head of a
 * data_file_segment; both lists are protected by
 * mount_info.pending_read_lock.
 */
struct pending_read {
	incfs_uuid_t file_id;

	s64 timestamp_us;

	/* NOTE(review): presumably set once the read has been satisfied */
	atomic_t done;

	int block_index;

	/* See mount_info.mi_last_pending_read_number */
	int serial_number;

	uid_t uid;

	/* Link in mount_info.mi_reads_list_head */
	struct list_head mi_reads_list;

	/* Link in data_file_segment.reads_list_head */
	struct list_head segment_reads_list;

	/* NOTE(review): presumably used for RCU-deferred freeing - confirm */
	struct rcu_head rcu;
};

/*
 * One of the SEGMENTS_PER_FILE segments of a data_file
 * (data_file.df_segments), used to reduce lock contention for the file.
 */
struct data_file_segment {
	/* NOTE(review): presumably woken when new data for the segment arrives */
	wait_queue_head_t new_data_arrival_wq;

	/* Protects reads and writes from the blockmap */
	struct rw_semaphore rwsem;

	/* List of active pending_read objects belonging to this segment */
	/* Protected by mount_info.pending_read_lock */
	struct list_head reads_list_head;
};

/*
 * Extra info associated with a file. Just a few bytes set by a user.
 *
 * NOTE(review): by their names the fields are the offset and size of the
 * stored value and a CRC over it - confirm where the offset is relative to.
 */
struct file_attr {
	loff_t fa_value_offset;

	size_t fa_value_size;

	u32 fa_crc;
};


/*
 * In-memory state of one incfs data file. Obtained from
 * incfs_open_data_file(), released with incfs_free_data_file(), and reachable
 * from an inode through inode_info.n_file.
 */
struct data_file {
	struct backing_file_context *df_backing_file_context;

	/* The mount this file belongs to */
	struct mount_info *df_mount_info;

	incfs_uuid_t df_id;

	/*
	 * Array of segments used to reduce lock contention for the file.
	 * The segment for a block is chosen based on the block's index.
	 */
	struct data_file_segment df_segments[SEGMENTS_PER_FILE];

	/* Base offset of the first metadata record. */
	loff_t df_metadata_off;

	/* Base offset of the block map. */
	loff_t df_blockmap_off;

	/* File size in bytes */
	loff_t df_size;

	/* File header flags */
	u32 df_header_flags;

	/* File size in DATA_FILE_BLOCK_SIZE blocks */
	int df_data_block_count;

	/* Total number of blocks, data + hash */
	int df_total_block_count;

	/* For mapped files, the offset into the actual file */
	loff_t df_mapped_offset;

	/* Number of data blocks written to file */
	atomic_t df_data_blocks_written;

	/* Number of data blocks in the status block */
	u32 df_initial_data_blocks_written;

	/* Number of hash blocks written to file */
	atomic_t df_hash_blocks_written;

	/* Number of hash blocks in the status block */
	u32 df_initial_hash_blocks_written;

	/* Offset to status metadata header */
	loff_t df_status_offset;

	/*
	 * Mutex acquired while enabling verity. Note that df_hash_tree is set
	 * by enable verity.
	 *
	 * The backing file mutex bc_mutex may be taken while this mutex is
	 * held.
	 */
	struct mutex df_enable_verity;

	/*
	 * Set either at construction time or during enabling verity. In the
	 * latter case, set via smp_store_release, so use smp_load_acquire to
	 * read it.
	 */
	struct mtree *df_hash_tree;

	/* Guaranteed set if df_hash_tree is set. */
	struct incfs_df_signature *df_signature;

	/*
	 * The verity file digest, set when verity is enabled and the file has
	 * been opened
	 */
	struct mem_range df_verity_file_digest;

	struct incfs_df_verity_signature *df_verity_signature;
};

/*
 * Per-open state of an incfs directory; get_incfs_dir_file() reads it from
 * file->private_data. Obtained from incfs_open_dir_file() and released with
 * incfs_free_dir_file().
 */
struct dir_file {
	struct mount_info *mount_info;

	/* Open file for the corresponding directory in the backing fs */
	struct file *backing_dir;
};

/*
 * incfs-private inode. The VFS inode is embedded as n_vfs_inode, and
 * get_incfs_node() recovers the enclosing inode_info from it with
 * container_of().
 */
struct inode_info {
	struct mount_info *n_mount_info; /* A mount, this file belongs to */

	struct inode *n_backing_inode;

	/* Returned by get_incfs_data_file() for regular files */
	struct data_file *n_file;

	struct inode n_vfs_inode;
};

/*
 * incfs-private dentry data, read from dentry->d_fsdata by
 * get_incfs_dentry(). get_incfs_backing_path() hands out a referenced copy
 * of backing_path.
 */
struct dentry_info {
	struct path backing_path;
};

/*
 * Value of incfs_file_data.fd_fill_permission: whether a file handle has
 * INCFS_IOC_FILL_BLOCKS permission.
 */
enum FILL_PERMISSION {
	CANT_FILL = 0,
	CAN_FILL = 1,
};

/*
 * State kept per file handle: the fill permission and the progress of
 * INCFS_IOC_GET_FILLED_BLOCKS. Passed to incfs_get_filled_blocks().
 */
struct incfs_file_data {
	/* Does this file handle have INCFS_IOC_FILL_BLOCKS permission */
	enum FILL_PERMISSION fd_fill_permission;

	/* If INCFS_IOC_GET_FILLED_BLOCKS has been called, where are we */
	int fd_get_block_pos;

	/* And how many filled blocks are there up to that point */
	int fd_filled_data_blocks;
	int fd_filled_hash_blocks;
};

/*
 * mount_info lifecycle. The definitions live outside this header.
 * NOTE(review): whether incfs_alloc_mount_info() reports failure with NULL
 * or an ERR_PTR() value is not visible here - confirm before checking the
 * result.
 */
struct mount_info *incfs_alloc_mount_info(struct super_block *sb,
					  struct mount_options *options,
					  struct path *backing_dir_path);

/* Presumably applies changed @options to an existing mount - confirm. */
int incfs_realloc_mount_info(struct mount_info *mi,
			     struct mount_options *options);

void incfs_free_mount_info(struct mount_info *mi);

/*
 * NOTE(review): returns a non-const char *; whether the caller owns (and
 * must free) the returned string is not visible here - confirm.
 */
char *file_id_to_str(incfs_uuid_t id);
struct dentry *incfs_lookup_dentry(struct dentry *parent, const char *name);

/* data_file lifecycle; @bf is presumably the opened backing file. */
struct data_file *incfs_open_data_file(struct mount_info *mi, struct file *bf);
void incfs_free_data_file(struct data_file *df);

/* dir_file lifecycle; @bf is presumably the opened backing directory. */
struct dir_file *incfs_open_dir_file(struct mount_info *mi, struct file *bf);
void incfs_free_dir_file(struct dir_file *dir);

/*
 * Timing limits, in microseconds, passed to incfs_read_data_file_block().
 * NOTE(review): presumably derived from the per-uid read timeouts kept in
 * mount_info - confirm.
 */
struct incfs_read_data_file_timeouts {
	u32 min_time_us;
	u32 min_pending_time_us;
	u32 max_pending_time_us;
};

/*
 * Read the block at @index of file @f into @dst, subject to @timeouts.
 * NOTE(review): @tmp is presumably scratch space for decompression and the
 * return value a byte count or negative errno - confirm.
 */
ssize_t incfs_read_data_file_block(struct mem_range dst, struct file *f,
			int index, struct mem_range tmp,
			struct incfs_read_data_file_timeouts *timeouts);

/* NOTE(review): the units of @offset are not visible here - confirm. */
ssize_t incfs_read_merkle_tree_blocks(struct mem_range dst,
				      struct data_file *df, size_t offset);

/* Backs INCFS_IOC_GET_FILLED_BLOCKS; progress is tracked in @fd. */
int incfs_get_filled_blocks(struct data_file *df,
			    struct incfs_file_data *fd,
			    struct incfs_get_filled_blocks_args *arg);

int incfs_read_file_signature(struct data_file *df, struct mem_range dst);

/*
 * Handle one incoming data or hash block described by @block.
 * NOTE(review): @data presumably points at the block payload already copied
 * into kernel memory - confirm.
 */
int incfs_process_new_data_block(struct data_file *df,
				 struct incfs_fill_block *block, u8 *data);

int incfs_process_new_hash_block(struct data_file *df,
				 struct incfs_fill_block *block, u8 *data);

/*
 * NOTE(review): presumably true when a pending read with a serial number
 * greater than @last_number exists - confirm.
 */
bool incfs_fresh_pending_reads_exist(struct mount_info *mi, int last_number);

/*
 * Collects pending reads and saves them into the array (reads/reads_size).
 * Only reads with serial_number > sn_lowerbound are reported.
 * Returns how many reads were saved into the array.
 *
 * NOTE(review): @reads and @reads2 are two alternative output formats;
 * presumably only one of them is non-NULL per call - confirm.
 */
int incfs_collect_pending_reads(struct mount_info *mi, int sn_lowerbound,
				struct incfs_pending_read_info *reads,
				struct incfs_pending_read_info2 *reads2,
				int reads_size, int *new_max_sn);

/*
 * Read log counterpart of incfs_collect_pending_reads().
 * NOTE(review): @start_state is non-const, so it is presumably advanced past
 * the collected records - confirm.
 */
int incfs_collect_logged_reads(struct mount_info *mi,
			       struct read_log_state *start_state,
			       struct incfs_pending_read_info *reads,
			       struct incfs_pending_read_info2 *reads2,
			       int reads_size);
struct read_log_state incfs_get_log_state(struct mount_info *mi);
int incfs_get_uncollected_logs_count(struct mount_info *mi,
				     const struct read_log_state *state);

/*
 * Map a VFS inode to the incfs inode_info that embeds it.
 *
 * Returns NULL if @inode is NULL, or if the magic number of its superblock
 * is not INCFS_MAGIC_NUMBER. In the latter case a warning is logged once,
 * because container_of() is only meaningful for inodes that are embedded in
 * a struct inode_info.
 */
static inline struct inode_info *get_incfs_node(struct inode *inode)
{
	if (!inode)
		return NULL;

	if (inode->i_sb->s_magic != INCFS_MAGIC_NUMBER) {
		/* This inode doesn't belong to us. */
		pr_warn_once("incfs: %s on an alien inode.\n", __func__);
		return NULL;
	}

	return container_of(inode, struct inode_info, n_vfs_inode);
}

static inline struct data_file *get_incfs_data_file(struct file *f)
{
	struct inode_info *node = NULL;

	if (!f)
		return NULL;

	if (!S_ISREG(f->f_inode->i_mode))
		return NULL;

	node = get_incfs_node(f->f_inode);
	if (!node)
		return NULL;

	return node->n_file;
}

/*
 * Return the dir_file stored in @f->private_data, or NULL if @f is NULL or
 * does not refer to a directory.
 */
static inline struct dir_file *get_incfs_dir_file(struct file *f)
{
	if (f && S_ISDIR(f->f_inode->i_mode))
		return (struct dir_file *)f->private_data;

	return NULL;
}

/*
 * Make sure that inode_info.n_file is initialized and inode can be used
 * for reading and writing data from/to the backing file.
 *
 * NOTE(review): the return value is presumably 0 on success and a negative
 * errno on failure - confirm against the definition.
 */
int make_inode_ready_for_data_ops(struct mount_info *mi,
				struct inode *inode,
				struct file *backing_file);

/* Return the dentry_info kept in @d->d_fsdata, or NULL if @d is NULL. */
static inline struct dentry_info *get_incfs_dentry(const struct dentry *d)
{
	return d ? (struct dentry_info *)d->d_fsdata : NULL;
}

static inline void get_incfs_backing_path(const struct dentry *d,
					  struct path *path)
{
	struct dentry_info *di = get_incfs_dentry(d);

	if (!di) {
		*path = (struct path) {};
		return;
	}

	*path = di->backing_path;
	path_get(path);
}

/*
 * Number of INCFS_DATA_FILE_BLOCK_SIZE blocks needed to hold @size bytes,
 * rounding up; 0 for an empty file.
 */
static inline int get_blocks_count_for_size(u64 size)
{
	/* An empty file occupies no blocks. */
	if (!size)
		return 0;

	/* Round up via (size - 1) so the addition cannot wrap around. */
	return (size - 1) / INCFS_DATA_FILE_BLOCK_SIZE + 1;
}

#endif /* _INCFS_DATA_MGMT_H */